datachain 0.7.11__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +53 -41
- datachain/cli.py +25 -3
- datachain/data_storage/sqlite.py +20 -6
- datachain/lib/dc.py +155 -109
- datachain/lib/diff.py +197 -0
- datachain/lib/meta_formats.py +38 -42
- datachain/query/dataset.py +1 -0
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/METADATA +3 -2
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/RECORD +15 -14
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/LICENSE +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/WHEEL +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join

 from .datasource import DataSource

@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11

 # dataset pull
-PULL_DATASET_MAX_THREADS =
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar

     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")

+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
@@ -207,31 +217,22 @@
         local_ds = metastore.get_dataset(self.local_ds_name)

         urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
-
-                r.raise_for_status()
-
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))

-
+        for url in urls:
+            if self.should_check_for_status():
+                self.check_for_status()

-
-
+            df = pd.read_parquet(
+                io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+            )
+            df = self.fix_columns(df)

-
-
-
-
-
+            inserted = warehouse.insert_dataset_rows(
+                df, local_ds, self.local_ds_version
+            )
+            self.increase_counter(inserted)  # type: ignore [arg-type]
+            # sometimes progress bar doesn't get updated so manually updating it
+            self.update_progress_bar(self.progress_bar)


 @dataclass
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")

-        if
+        if cp and not output:
             raise ValueError("Please provide output directory for instantiation")

         studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
         signed_urls = export_response.data

         if signed_urls:
-            shuffle(signed_urls)
-
             with (
                 self.metastore.clone() as metastore,
                 self.warehouse.clone() as warehouse,
             ):
+
+                def batch(urls):
+                    """
+                    Batching urls in a way that fetching is most efficient as
+                    urls with lower id will be created first. Because that, we
+                    are making sure all threads are pulling most recent urls
+                    from beginning
+                    """
+                    res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                    current_worker = 0
+                    for url in signed_urls:
+                        res[current_worker].append(url)
+                        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                    return res
+
                 rows_fetcher = DatasetRowsFetcher(
                     metastore,
                     warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                     local_ds_name,
                     local_ds_version,
                     schema,
+                    progress_bar=dataset_save_progress_bar,
                 )
                 try:
                     rows_fetcher.run(
-
-                        signed_urls,
-                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                    ),
-                    dataset_save_progress_bar,
+                        iter(batch(signed_urls)), dataset_save_progress_bar
                     )
                 except:
                     self.remove_dataset(local_ds_name, local_ds_version)
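For reference, a minimal standalone sketch of the round-robin dealing that the new `batch()` helper performs in `pull_dataset` (it replaces the previous shuffle-and-split approach so every worker starts on the earliest-created chunks). `PULL_DATASET_MAX_THREADS` mirrors the constant above; the URLs are placeholders:

```py
# Round-robin dealing of signed URLs across a fixed number of worker threads.
PULL_DATASET_MAX_THREADS = 5  # same value this release pins above


def batch(signed_urls: list[str]) -> list[list[str]]:
    res: list[list[str]] = [[] for _ in range(PULL_DATASET_MAX_THREADS)]
    current_worker = 0
    for url in signed_urls:
        res[current_worker].append(url)
        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
    return res


urls = [f"https://example.com/chunk-{i}.parquet.lz4" for i in range(12)]  # placeholders
for worker_id, worker_urls in enumerate(batch(urls)):
    # worker 0 gets chunks 0, 5, 10; worker 1 gets 1, 6, 11; and so on
    print(worker_id, worker_urls)
```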
datachain/cli.py
CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )

+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+

 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--
+        "--cp",
         default=False,
         action="store_true",
-        help="
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.output,
             local_ds_name=args.local_name,
             local_ds_version=args.local_version,
-
+            cp=args.cp,
             force=bool(args.force),
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
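The CLI gains a `studio cancel` subcommand and a `--cp` flag on `pull`. A reduced, self-contained sketch of how the new `cancel` arguments parse, using a bare `argparse` setup rather than datachain's full parser tree (the job id and team below are invented):

```py
# Simplified stand-in for the "studio cancel" wiring added in add_studio_parser().
from argparse import ArgumentParser

parser = ArgumentParser(prog="datachain")
studio = parser.add_subparsers(dest="command").add_parser("studio")
studio_subparser = studio.add_subparsers(dest="cmd")

cancel = studio_subparser.add_parser(
    "cancel", description="This command cancels a job in Studio."
)
cancel.add_argument("job_id", action="store", help="The job ID to cancel.")
cancel.add_argument(
    "--team",
    action="store",
    default=None,
    help="The team to cancel a job for. By default, it will use team from config.",
)

args = parser.parse_args(["studio", "cancel", "job-123", "--team", "my-team"])
print(args.cmd, args.job_id, args.team)  # -> cancel job-123 my-team
```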
datachain/data_storage/sqlite.py
CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):

     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)

     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)

     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )

     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-
-
-
-
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )

     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
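The `insert_rows` change above wraps the bulk `executemany` in a single transaction and threads the transaction connection through the new `conn=` parameter, so SQLite no longer finalizes work around each batch. A rough stdlib-only illustration of the same effect (table and rows are invented; datachain itself goes through its `DatabaseEngine` wrapper rather than raw `sqlite3`):

```py
# Group the whole bulk insert under one explicit transaction: sqlite3 commits once
# when the `with` block exits instead of committing piecemeal.
import sqlite3

rows = [(i, f"name-{i}") for i in range(10_000)]  # dummy data

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE items (id INTEGER, name TEXT)")

with db:  # one transaction for all 10,000 rows
    db.executemany("INSERT INTO items VALUES (?, ?)", rows)

print(db.execute("SELECT count(*) FROM items").fetchone()[0])  # 10000
```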
datachain/lib/dc.py
CHANGED
@@ -41,7 +41,7 @@ from datachain.lib.listing import (
     parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -554,8 +554,7 @@ class DataChain:
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +563,12 @@
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays

@@ -594,75 +593,14 @@
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name =
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
@@ -793,47 +731,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )

-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1521,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
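A hedged usage sketch of the new `compare()` API described in the docstrings above; `DataChain.from_values` and the column names are illustrative only, and the per-row status letters come from the new `datachain/lib/diff.py`:

```py
from datachain.lib.dc import DataChain

# Illustrative in-memory chains; real chains usually come from storage or datasets.
persons = DataChain.from_values(id=[1, 2, 3], name=["Alice", "Bob", "Cara"])
new_persons = DataChain.from_values(id=[2, 3, 4], name=["Bob", "Carol", "Dan"])

# Rows are matched on "id"; "name" decides modified vs. same. The extra "diff"
# column carries the per-row status (added / deleted / modified / same).
result = new_persons.compare(
    persons,
    on="id",
    compare="name",
    added=True,
    deleted=True,
    modified=True,
    same=True,
    status_col="diff",
)
result.show()
```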
datachain/lib/diff.py
ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Comparing two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns prefix of right of two companion left - right columns
+        from merge. If companion columns have the same name then prefix will
+        be present in right column name, otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must be have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must be have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # have additional column)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
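Stripped of the SQLAlchemy plumbing, the function above performs two outer merges and a union: left rows with no matching right-hand key become `A`, rows whose compare columns differ become `M`, matching rows become `S`, and a second merge of right against left supplies the `D` rows. A tiny pure-Python rendering of that decision table with made-up rows, just to show what each status means:

```py
# Pure-Python rendering of the A/D/M/S decision the SQL case expression encodes.
left = {1: "Alice", 2: "Bob", 3: "Cara"}   # id -> compare value in the left chain
right = {2: "Bob", 3: "Carol", 4: "Dan"}   # id -> compare value in the right chain

statuses = {}
for key, value in left.items():
    if key not in right:
        statuses[key] = "A"        # present only on the left: added
    elif value != right[key]:
        statuses[key] = "M"        # key matches but compare column differs: modified
    else:
        statuses[key] = "S"        # key and compare columns match: same
for key in right:
    if key not in left:
        statuses[key] = "D"        # present only on the right: deleted

print(statuses)  # {1: 'A', 2: 'S', 3: 'M', 4: 'D'}
```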
datachain/lib/meta_formats.py
CHANGED
@@ -38,38 +38,41 @@ def process_json(data_string, jmespath):
     return json_dict


-
-
+def gen_datamodel_code(
+    source_file, format="json", jmespath=None, model_name=None
+) -> str:
+    """Generates Python code with Pydantic models that corresponds
+    to the provided JSON, CSV, or JSONL file.
+    It support root JSON arrays (samples the first entry).
+    """
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
         # comply with Python class names
         uid_str = str(generate_uuid()).replace("-", "")
-        model_name = f"Model{
-
-
-
-
-
-
-
-
-
-
-
-
-
-        json_object = process_json(data_string, expr)
-        if data_type == "json" and isinstance(json_object, list):
+        model_name = f"Model{format}{uid_str}"
+
+    with source_file.open() as fd:  # CSV can be larger than memory
+        if format == "csv":
+            data_string += fd.readline().replace("\r", "")
+            data_string += fd.readline().replace("\r", "")
+        elif format == "jsonl":
+            data_string = fd.readline().replace("\r", "")
+        else:
+            data_string = fd.read()  # other meta must fit into RAM
+
+    if format in ("json", "jsonl"):
+        json_object = process_json(data_string, jmespath)
+        if format == "json" and isinstance(json_object, list):
             json_object = json_object[0]  # sample the 1st object from JSON array
-        if
-
+        if format == "jsonl":
+            format = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)

     import datamodel_code_generator

     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
-    input_file_type = input_file_types[
+    input_file_type = input_file_types[format]
     with tempfile.TemporaryDirectory() as tmpdir:
         output = Path(tmpdir) / "model.py"
         datamodel_code_generator.generate(
@@ -95,36 +98,29 @@ spec = {model_name}
 def read_meta(  # noqa: C901
     spec=None,
     schema_from=None,
-
+    format="json",
     jmespath=None,
-    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain

     if schema_from:
-
-            DataChain.from_storage(schema_from, type="text")
-            .limit(1)
-            .map(  # dummy column created (#1615)
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        file = next(
+            DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
         )
-
-
-
-
+        model_code = gen_datamodel_code(
+            file, format=format, jmespath=jmespath, model_name=model_name
+        )
+        assert isinstance(model_code, str)
+
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         gl = globals()
-        exec(
+        exec(model_code, gl)  # type: ignore[arg-type]  # noqa: S102
         spec = gl["spec"]

-    if not
+    if not spec and not schema_from:
         raise ValueError(
             "Must provide a static schema in spec: or metadata sample in schema_from:"
         )
@@ -136,7 +132,7 @@ def read_meta(  # noqa: C901
     def parse_data(
         file: File,
         data_model=spec,
-
+        format=format,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
@@ -148,7 +144,7 @@ def read_meta(  # noqa: C901
         except ValidationError as e:
            print(f"Validation error occurred in row {nrow} file {file.name}:", e)

-        if
+        if format == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
@@ -156,7 +152,7 @@ def read_meta(  # noqa: C901
            for row in reader:  # CSV can be larger than memory
                yield from validator(row)

-        if
+        if format == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
@@ -174,7 +170,7 @@ def read_meta(  # noqa: C901
                return
            yield from validator(json_dict, nrow)

-        if
+        if format == "jsonl":
            try:
                nrow = 0
                with file.open() as fd:
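`gen_datamodel_code` samples the file (a couple of lines for CSV/JSONL, the whole payload for JSON) and hands the text to `datamodel-code-generator`. A reduced sketch of that final step, assuming the `datamodel-code-generator` package is installed; the JSON sample and class name are invented, and the exact keyword arguments datachain passes to `generate()` are not all visible in this diff:

```py
# Generate Pydantic model source from a small JSON sample, as gen_datamodel_code() does.
import tempfile
from pathlib import Path

import datamodel_code_generator

data_string = '{"id": 1, "name": "example", "tags": ["a", "b"]}'  # invented sample

input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
input_file_type = input_file_types["json"]

with tempfile.TemporaryDirectory() as tmpdir:
    output = Path(tmpdir) / "model.py"
    datamodel_code_generator.generate(
        data_string,
        input_file_type=input_file_type,
        output=output,
        class_name="ModelExample",
    )
    model_code = output.read_text()

print(model_code)  # BaseModel subclasses inferred from the sample
```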
datachain/query/dataset.py
CHANGED
@@ -1069,6 +1069,7 @@ class DatasetQuery:
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
         self.starting_step = QueryStep(self.catalog, name, self.version)
+        self.dialect = self.catalog.warehouse.db.dialect

     def __iter__(self):
         return iter(self.db_results())
datachain/remote/studio.py
CHANGED
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 import os
-from collections.abc import Iterable, Iterator
+from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
 from typing import (
@@ -11,6 +11,9 @@ from typing import (
     Optional,
     TypeVar,
 )
+from urllib.parse import urlparse, urlunparse
+
+import websockets

 from datachain.config import Config
 from datachain.dataset import DatasetStats
@@ -22,6 +25,7 @@ LsData = Optional[list[dict[str, Any]]]
 DatasetInfoData = Optional[dict[str, Any]]
 DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
+DatasetJobVersionsData = Optional[dict[str, Any]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
 FileUploadData = Optional[dict[str, Any]]
@@ -231,6 +235,40 @@ class StudioClient:

         return msgpack.ExtType(code, data)

+    async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
+        """
+        Follow job logs via websocket connection.
+
+        Args:
+            job_id: ID of the job to follow logs for
+
+        Yields:
+            Dict containing either job status updates or log messages
+        """
+        parsed_url = urlparse(self.url)
+        ws_url = urlunparse(
+            parsed_url._replace(scheme="wss" if parsed_url.scheme == "https" else "ws")
+        )
+        ws_url = f"{ws_url}/logs/follow/?job_id={job_id}&team_name={self.team}"
+
+        async with websockets.connect(
+            ws_url,
+            additional_headers={"Authorization": f"token {self.token}"},
+        ) as websocket:
+            while True:
+                try:
+                    message = await websocket.recv()
+                    data = json.loads(message)
+
+                    # Yield the parsed message data
+                    yield data
+
+                except websockets.exceptions.ConnectionClosed:
+                    break
+                except Exception as e:  # noqa: BLE001
+                    logger.error("Error receiving websocket message: %s", e)
+                    break
+
     def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]:
         # TODO: change LsData (response.data value) to be list of lists
         # to handle cases where a path will be expanded (i.e. globs)
@@ -302,6 +340,13 @@ class StudioClient:
             method="GET",
         )

+    def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
+        return self._send_request(
+            "datachain/datasets/dataset_job_versions",
+            {"job_id": job_id},
+            method="GET",
+        )
+
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
             "datachain/datasets/stats",
@@ -359,3 +404,10 @@ class StudioClient:
             "requirements": requirements,
         }
         return self._send_request("datachain/job", data)
+
+    def cancel_job(
+        self,
+        job_id: str,
+    ) -> Response[JobData]:
+        url = f"datachain/job/{job_id}/cancel"
+        return self._send_request(url, data={}, method="POST")
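A hedged sketch of consuming the new `tail_job_logs()` async generator from synchronous code, which is essentially what `create_job` in `datachain/studio.py` now does; it assumes you are already logged in to Studio, and the job id is a placeholder:

```py
# Drive the websocket log tailer with asyncio.run(); the message shapes follow the
# fields the CLI reads ("logs" entries and "job" status updates).
import asyncio

from datachain.remote.studio import StudioClient


async def follow(job_id: str) -> None:
    client = StudioClient()
    async for message in client.tail_job_logs(job_id):
        if "logs" in message:
            for log in message["logs"]:
                print(log["message"], end="")
        elif "job" in message:
            print(f"\n>>>> Job is now in {message['job']['status']} status.")


asyncio.run(follow("job-123"))  # "job-123" is a placeholder id
```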
datachain/studio.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import os
 from typing import TYPE_CHECKING, Optional

@@ -19,7 +20,7 @@ POST_LOGIN_MESSAGE = (
 )


-def process_studio_cli_args(args: "Namespace"):
+def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
     if args.cmd == "login":
         return login(args)
     if args.cmd == "logout":
@@ -47,6 +48,9 @@ def process_studio_cli_args(args: "Namespace"):
             args.req_file,
         )

+    if args.cmd == "cancel":
+        return cancel_job(args.job_id, args.team)
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -227,8 +231,34 @@ def create_job(
     if not response.data:
         raise DataChainError("Failed to create job")

-
+    job_id = response.data.get("job", {}).get("id")
+    print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+    print("=" * 40)
+
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f"  - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")


 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
@@ -248,3 +278,18 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
         if file_id:
             file_ids.append(str(file_id))
     return file_ids
+
+
+def cancel_job(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    response = client.cancel_job(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    print(f"Job {job_id} canceled")
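For completeness, a minimal sketch of calling the new cancel path directly through `StudioClient`, mirroring `cancel_job()` above but without the CLI wrapper; the team name and job id are placeholders and a valid Studio token is assumed:

```py
from datachain.remote.studio import StudioClient

client = StudioClient(team="my-team")     # placeholder team
response = client.cancel_job("job-123")   # placeholder job id

if response.ok:
    print("Job canceled")
else:
    print("Cancel failed:", response.message)
```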
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.
+Version: 0.8.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
 Requires-Dist: tabulate
+Requires-Dist: websockets
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.
+Requires-Dist: ultralytics==8.3.50; extra == "examples"

 ================
 |logo| DataChain
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/RECORD
CHANGED
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=
+datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
@@ -14,11 +14,11 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=nuWjSIs4MO1hJa8-LQGbiMXLWWznPB_VKSVpS7368t4,58415
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -35,7 +35,7 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
 datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
+datachain/data_storage/sqlite.py,sha256=iJv1QxwVifOowtYhIDqYVoea21dvkQIdxklGNIend3c,22961
 datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
 datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
@@ -53,13 +53,14 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=
+datachain/lib/dc.py,sha256=7Wm6TEPVNCSh4bz0iA9JvEsYtYAZ9o97lK7TEJ8modE,92149
+datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
 datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=
+datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
@@ -88,7 +89,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=fECGctERQrfLIowN9Fo6dTSnmHEe9WbfcjHRtRObcio,54667
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,7 +97,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
 datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=3DlgESETzxm3dgb6zzjjGxsddSkacT2dARnteLAfMxQ,13366
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -118,9 +119,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.8.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.8.0.dist-info/METADATA,sha256=PXb2pYY67bdfDjFXR7C9hwN6LaKSmseRZJNFakrWfyg,8437
+datachain-0.8.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.8.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.8.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.8.0.dist-info/RECORD,,
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/LICENSE
File without changes
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/WHEEL
File without changes
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/entry_points.txt
File without changes
{datachain-0.7.11.dist-info → datachain-0.8.0.dist-info}/top_level.txt
File without changes