datachain 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +99 -113
- datachain/catalog/loader.py +8 -65
- datachain/cli.py +148 -57
- datachain/data_storage/__init__.py +0 -3
- datachain/data_storage/metastore.py +2 -9
- datachain/data_storage/sqlite.py +7 -145
- datachain/data_storage/warehouse.py +1 -5
- datachain/dataset.py +15 -0
- datachain/func/__init__.py +2 -1
- datachain/func/func.py +7 -2
- datachain/lib/dc.py +4 -4
- datachain/lib/pytorch.py +1 -4
- datachain/query/dataset.py +0 -5
- datachain/query/dispatch.py +1 -13
- datachain/query/session.py +0 -1
- datachain/remote/studio.py +33 -1
- datachain/studio.py +80 -0
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/METADATA +1 -1
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/RECORD +23 -24
- datachain/data_storage/id_generator.py +0 -136
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/LICENSE +0 -0
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/WHEEL +0 -0
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/top_level.txt +0 -0
datachain/lib/pytorch.py
CHANGED
@@ -70,20 +70,17 @@ class PytorchDataset(IterableDataset):
         # For compatibility with multiprocessing,
         # we can only store params in __init__(), as Catalog isn't picklable
         # see https://github.com/iterative/dvcx/issues/954
-        self._idgen_params = catalog.id_generator.clone_params()
         self._ms_params = catalog.metastore.clone_params()
         self._wh_params = catalog.warehouse.clone_params()
         self._catalog_params = catalog.get_init_params()
         self.catalog: Optional[Catalog] = None

     def _get_catalog(self) -> "Catalog":
-        idgen_cls, idgen_args, idgen_kwargs = self._idgen_params
-        idgen = idgen_cls(*idgen_args, **idgen_kwargs)
         ms_cls, ms_args, ms_kwargs = self._ms_params
         ms = ms_cls(*ms_args, **ms_kwargs)
         wh_cls, wh_args, wh_kwargs = self._wh_params
         wh = wh_cls(*wh_args, **wh_kwargs)
-        return Catalog(idgen, ms, wh, **self._catalog_params)
+        return Catalog(ms, wh, **self._catalog_params)

     def __iter__(self) -> Iterator[Any]:
         if self.catalog is None:
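The net effect of this hunk: a Catalog is rebuilt inside each worker process from two picklable (class, args, kwargs) tuples instead of three. A minimal sketch of that pattern, assuming Catalog is importable from datachain.catalog as the hunk suggests:

    from datachain.catalog import Catalog

    def rebuild_catalog(ms_params, wh_params, catalog_params):
        # clone_params() yields a picklable (class, args, kwargs) tuple;
        # the tuples, not the Catalog itself, cross the process boundary.
        ms_cls, ms_args, ms_kwargs = ms_params
        wh_cls, wh_args, wh_kwargs = wh_params
        # 0.7.7 signature: metastore and warehouse only, no id generator.
        return Catalog(
            ms_cls(*ms_args, **ms_kwargs),
            wh_cls(*wh_args, **wh_kwargs),
            **catalog_params,
        )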
datachain/query/dataset.py
CHANGED
@@ -442,9 +442,6 @@ class UDFStep(Step, ABC):
         udf_info = {
             "udf_data": filtered_cloudpickle_dumps(self.udf),
             "catalog_init": self.catalog.get_init_params(),
-            "id_generator_clone_params": (
-                self.catalog.id_generator.clone_params()
-            ),
             "metastore_clone_params": self.catalog.metastore.clone_params(),
             "warehouse_clone_params": self.catalog.warehouse.clone_params(),
             "table": udf_table,
@@ -969,8 +966,6 @@ class SQLGroupBy(SQLClause):
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")
-        if not self.group_by:
-            raise ValueError("No columns to group by")

         subquery = query.subquery()

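The SQLGroupBy hunk drops the empty-group-by guard: a grouped select with aggregate columns but no group-by columns collapses to a whole-table aggregate, which is valid SQL. A hedged SQLAlchemy illustration of that case (table and column names are made up):

    import sqlalchemy as sa

    metadata = sa.MetaData()
    items = sa.Table("items", metadata, sa.Column("price", sa.Integer))

    subquery = sa.select(items).subquery()
    # No .group_by() call at all: this is a plain whole-table aggregate,
    # which is why rejecting an empty group_by was unnecessary.
    query = sa.select(sa.func.sum(subquery.c.price))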
datachain/query/dispatch.py
CHANGED
@@ -65,7 +65,6 @@ def udf_entrypoint() -> int:
     dispatch = UDFDispatcher(
         udf_info["udf_data"],
         udf_info["catalog_init"],
-        udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
         udf_fields=udf_info["udf_fields"],
@@ -119,7 +118,6 @@ class UDFDispatcher:
         self,
         udf_data,
         catalog_init_params,
-        id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
         udf_fields: "Sequence[str]",
@@ -129,11 +127,6 @@ class UDFDispatcher:
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
-        (
-            self.id_generator_class,
-            self.id_generator_args,
-            self.id_generator_kwargs,
-        ) = id_generator_clone_params
         (
             self.metastore_class,
             self.metastore_args,
@@ -155,18 +148,13 @@ class UDFDispatcher:

     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
-            id_generator = self.id_generator_class(
-                *self.id_generator_args, **self.id_generator_kwargs
-            )
             metastore = self.metastore_class(
                 *self.metastore_args, **self.metastore_kwargs
             )
             warehouse = self.warehouse_class(
                 *self.warehouse_args, **self.warehouse_kwargs
             )
-            self.catalog = Catalog(
-                id_generator, metastore, warehouse, **self.catalog_init_params
-            )
+            self.catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
         self.udf = loads(self.udf_data)
         return UDFWorker(
             self.catalog,
datachain/query/session.py
CHANGED
datachain/remote/studio.py
CHANGED
@@ -1,3 +1,4 @@
+import base64
 import json
 import logging
 import os
@@ -23,7 +24,8 @@ DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
-
+FileUploadData = Optional[dict[str, Any]]
+JobData = Optional[dict[str, Any]]

 logger = logging.getLogger("datachain")

@@ -308,3 +310,33 @@ class StudioClient:
             "datachain/dataset-export-status",
             {"dataset_name": name, "dataset_version": version},
         )
+
+    def upload_file(self, file_name: str, content: bytes) -> Response[FileUploadData]:
+        data = {
+            "file_content": base64.b64encode(content).decode("utf-8"),
+            "file_name": file_name,
+        }
+        return self._send_request("datachain/upload-file", data)
+
+    def create_job(
+        self,
+        query: str,
+        query_type: str,
+        environment: Optional[str] = None,
+        workers: Optional[int] = None,
+        query_name: Optional[str] = None,
+        files: Optional[list[str]] = None,
+        python_version: Optional[str] = None,
+        requirements: Optional[str] = None,
+    ) -> Response[JobData]:
+        data = {
+            "query": query,
+            "query_type": query_type,
+            "environment": environment,
+            "workers": workers,
+            "query_name": query_name,
+            "files": files,
+            "python_version": python_version,
+            "requirements": requirements,
+        }
+        return self._send_request("datachain/job", data)
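Both new methods funnel through the existing _send_request helper, so they inherit its auth and error handling, and upload_file base64-encodes the payload client-side. A hypothetical round trip using them (team and file names are made up; a Studio token must already be configured locally):

    from datachain.remote.studio import StudioClient

    client = StudioClient(team="my-team")

    # Upload an auxiliary file; the client encodes the bytes itself.
    with open("utils.py", "rb") as f:
        upload = client.upload_file("utils.py", f.read())

    file_id = None
    if upload.ok and upload.data:
        file_id = upload.data.get("blob", {}).get("id")

    # Submit a job referencing the uploaded file, if any.
    job = client.create_job(
        query="print('hello from Studio')",
        query_type="PYTHON",
        files=[str(file_id)] if file_id else None,
    )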
datachain/studio.py
CHANGED
@@ -34,6 +34,19 @@ def process_studio_cli_args(args: "Namespace"):
         print(tabulate(rows, headers="keys"))
         return 0

+    if args.cmd == "run":
+        return create_job(
+            args.query_file,
+            args.team,
+            args.env_file,
+            args.env,
+            args.workers,
+            args.files,
+            args.python_version,
+            args.req,
+            args.req_file,
+        )
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -168,3 +181,70 @@ def save_config(hostname, token):
     conf["studio"] = studio_conf

     return config.config_file()
+
+
+def create_job(
+    query_file: str,
+    team_name: Optional[str],
+    env_file: Optional[str] = None,
+    env: Optional[list[str]] = None,
+    workers: Optional[int] = None,
+    files: Optional[list[str]] = None,
+    python_version: Optional[str] = None,
+    req: Optional[list[str]] = None,
+    req_file: Optional[str] = None,
+):
+    query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
+    with open(query_file) as f:
+        query = f.read()
+
+    environment = "\n".join(env) if env else ""
+    if env_file:
+        with open(env_file) as f:
+            environment = f.read() + "\n" + environment
+
+    requirements = "\n".join(req) if req else ""
+    if req_file:
+        with open(req_file) as f:
+            requirements = f.read() + "\n" + requirements
+
+    client = StudioClient(team=team_name)
+    file_ids = upload_files(client, files) if files else []
+
+    response = client.create_job(
+        query=query,
+        query_type=query_type,
+        environment=environment,
+        workers=workers,
+        query_name=os.path.basename(query_file),
+        files=file_ids,
+        python_version=python_version,
+        requirements=requirements,
+    )
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    if not response.data:
+        raise DataChainError("Failed to create job")
+
+    print(f"Job {response.data.get('job', {}).get('id')} created")
+    print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+
+
+def upload_files(client: StudioClient, files: list[str]) -> list[str]:
+    file_ids = []
+    for file in files:
+        file_name = os.path.basename(file)
+        with open(file, "rb") as f:
+            file_content = f.read()
+        response = client.upload_file(file_name, file_content)
+        if not response.ok:
+            raise_remote_error(response.message)

+        if not response.data:
+            raise DataChainError(f"Failed to upload file {file_name}")
+
+        file_id = response.data.get("blob", {}).get("id")
+        if file_id:
+            file_ids.append(str(file_id))
+    return file_ids
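Taken together with the CLI wiring above, a Python-level call equivalent to the new `datachain studio run` command might look like this (file and team names are made up; positional order follows the create_job signature in the hunk):

    from datachain.studio import create_job

    create_job(
        "my_query.py",       # .py extension => query_type "PYTHON", else "SHELL"
        "my-team",           # team_name
        env=["MY_VAR=1"],    # joined with newlines into the job environment
        workers=2,
        files=["utils.py"],  # uploaded first via upload_files()
        req=["pandas"],      # joined with newlines into requirements
    )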
{datachain-0.7.5.dist-info → datachain-0.7.7.dist-info}/RECORD
CHANGED
@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=Ysm-6Kb-54FfkN35VJIe5vW7Kik8VGA3wcyCUnqPBHg,42245
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
@@ -14,13 +14,13 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=MthVADn-jM2I5TlESOfbzFnKGZjpuk9bM8m2vqOK-C8,7227
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
-datachain/catalog/loader.py,sha256=
+datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -29,21 +29,20 @@ datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
 datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
 datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
-datachain/data_storage/__init__.py,sha256=
+datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
-datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=EzSsfR_l_84i1AewYygpdsJyzGqEmvXjpeohlYF7h4A,37435
 datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
-datachain/data_storage/warehouse.py,sha256=
-datachain/func/__init__.py,sha256=
+datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
+datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
+datachain/func/__init__.py,sha256=VAN7N2-eCHgidMCFI-fJTkCwdI1U_NIuCOgYc4sfYUQ,812
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
 datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
 datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
 datachain/func/conditional.py,sha256=mQroxsoExpBW84Zm5dAYP4OpBblWmzfnF2qJq9rba54,2223
-datachain/func/func.py,sha256=
+datachain/func/func.py,sha256=GykhTvNbACFSwaSXsgVlDnqR48kpP_GNAxm3bcq1RYg,12560
 datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
 datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
 datachain/func/string.py,sha256=NQzaXXYu7yb72HPADy4WrFlcgvTS77L9x7-qvCKJtnk,4522
@@ -53,7 +52,7 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=t5y5tsYyU7uuk3gEPPhhcDSZ1tL1aHkKG2W54eHiUq8,89492
 datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
@@ -61,7 +60,7 @@ datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
-datachain/lib/pytorch.py,sha256=
+datachain/lib/pytorch.py,sha256=Nh6fUbQMLX8OpZvX4tw4bJjTCQpRKi0jSLgkJnLHdTM,5880
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
 datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
@@ -88,15 +87,15 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=J6SbCLnFlZgCxRchc3tVk5tcC7xo1Hp616JGlEZXCDo,54547
+datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
-datachain/query/session.py,sha256=
+datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=jp6NWo7OPUxqO8uYEHP0_XFlmj47rMxC80qKQ7rA3Xk,11024
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -117,9 +116,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.5.dist-info/METADATA,sha256=
-datachain-0.7.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.5.dist-info/RECORD,,
+datachain-0.7.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.7.dist-info/METADATA,sha256=laxYaz9f-PIJ30f3krSjRu45CjyfbnBM8Q4kddXa9dM,18006
+datachain-0.7.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.7.dist-info/RECORD,,
datachain/data_storage/id_generator.py
REMOVED
@@ -1,136 +0,0 @@
-import logging
-from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from functools import cached_property
-from typing import TYPE_CHECKING, Optional
-
-from sqlalchemy import Column, Integer, Table, Text
-
-from datachain.data_storage.serializer import Serializable
-
-if TYPE_CHECKING:
-    from sqlalchemy.schema import SchemaItem
-
-    from datachain.data_storage.db_engine import DatabaseEngine
-
-
-logger = logging.getLogger("datachain")
-
-
-class AbstractIDGenerator(ABC, Serializable):
-    """
-    Abstract ID Generator class. This class is responsible for generating
-    unique IDs for each prefix (e.g. S3 bucket or dataset).
-    """
-
-    @abstractmethod
-    def clone(self) -> "AbstractIDGenerator":
-        """Clones AbstractIDGenerator implementation."""
-
-    def init(self) -> None:
-        """Initialize ID generator."""
-
-    def cleanup_for_tests(self):
-        """Cleanup for tests."""
-
-    def close(self) -> None:
-        """Closes any active database connections."""
-
-    def close_on_exit(self) -> None:
-        """Closes any active database or HTTP connections, called on Session exit or
-        for test cleanup only, as some ID Generator implementations may handle this
-        differently.
-        """
-        self.close()
-
-    @abstractmethod
-    def init_id(self, uri: str) -> None:
-        """Initializes the ID generator for the given URI with zero last_id."""
-
-    @abstractmethod
-    def get_next_ids(self, uri: str, count: int) -> range:
-        """Returns a range of IDs for the given URI."""
-
-    def get_next_id(self, uri: str) -> int:
-        """Returns the next ID for the given URI."""
-        return self.get_next_ids(uri, 1)[0]
-
-    def delete_uri(self, uri: str):
-        """Deletes the given URI."""
-        self.delete_uris([uri])
-
-    def delete_uris(self, uris: Iterable[str]):
-        """Deletes the given URIs."""
-
-
-class AbstractDBIDGenerator(AbstractIDGenerator):
-    """
-    Abstract ID Generator class, to be implemented by any Database Adapters
-    for a specific database system. This class is responsible for generating
-    unique IDs for each prefix (e.g. S3 bucket or dataset) and storing them
-    in a database. It is also responsible for initializing the database
-    and creating the necessary tables.
-    """
-
-    _db: "DatabaseEngine"
-    _table_prefix: Optional[str] = None
-    _skip_db_init: bool = False
-    _base_table_name = "id_generator"
-
-    def __init__(
-        self,
-        db: "DatabaseEngine",
-        table_prefix: Optional[str] = None,
-        skip_db_init: bool = False,
-    ):
-        self._db = db
-        self._table_prefix = table_prefix
-        self._skip_db_init = skip_db_init
-        if db and not skip_db_init:
-            self.init()
-
-    @abstractmethod
-    def clone(self) -> "AbstractDBIDGenerator":
-        """Clones AbstractIDGenerator implementation."""
-
-    def close(self) -> None:
-        """Closes any active database connections."""
-        self.db.close()
-
-    @property
-    def db(self) -> "DatabaseEngine":
-        return self._db
-
-    @property
-    def _columns(self) -> list["SchemaItem"]:
-        return [
-            Column("uri", Text, primary_key=True, nullable=False),
-            # This is the last id used (and starts at zero if no ids have been used)
-            Column("last_id", Integer, nullable=False),
-        ]
-
-    @cached_property
-    def _table(self) -> Table:
-        table_name = self._base_table_name
-        if self._table_prefix:
-            table_name = f"{self._table_prefix}_{table_name}"
-        return Table(table_name, self.db.metadata, *self._columns, extend_existing=True)
-
-    def init(self) -> None:
-        self.db.create_table(self._table, if_not_exists=True)
-
-    def cleanup_for_tests(self):
-        """Cleanup for tests."""
-        self.db.drop_table(self._table, if_exists=True)
-
-    @abstractmethod
-    def init_id(self, uri: str) -> None:
-        """Initializes the ID generator for the given URI with zero last_id."""
-
-    @abstractmethod
-    def get_next_ids(self, uri: str, count: int) -> range:
-        """Returns a range of IDs for the given URI."""
-
-    def delete_uris(self, uris: Iterable[str]):
-        """Deletes the given URIs from the database."""
-        self.db.execute(self._table.delete().where(self._table.c.uri.in_(uris)))