datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/catalog/catalog.py +53 -41
- datachain/cli.py +25 -3
- datachain/client/__init__.py +1 -2
- datachain/data_storage/sqlite.py +20 -6
- datachain/lib/dc.py +160 -110
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +2 -1
- datachain/lib/meta_formats.py +40 -43
- datachain/lib/pytorch.py +1 -5
- datachain/lib/signal_schema.py +28 -6
- datachain/query/dataset.py +5 -1
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/toolkit/split.py +19 -6
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/METADATA +10 -10
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/RECORD +20 -19
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/LICENSE +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/WHEEL +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.10.dist-info → datachain-0.8.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join
 
 from .datasource import DataSource
 
@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11
 
 # dataset pull
-PULL_DATASET_MAX_THREADS =
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar
 
     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")
 
+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
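The `do_task` refactor below moves the download-and-retry logic into the new `get_parquet_content()` method shown above. A minimal standalone sketch of that polling pattern (the constants and URL are placeholders, not the library's objects):

```py
# Sketch of the retry-until-available download performed by get_parquet_content():
# a 404 means the exported chunk is not ready yet, so sleep briefly and retry.
import time

import requests

CHUNK_TIMEOUT = 3600   # placeholder for PULL_DATASET_CHUNK_TIMEOUT
SLEEP_INTERVAL = 0.1   # placeholder for PULL_DATASET_SLEEP_INTERVAL


def fetch_when_ready(url: str) -> bytes:
    while True:
        r = requests.get(url, timeout=CHUNK_TIMEOUT)
        if r.status_code == 404:
            time.sleep(SLEEP_INTERVAL)  # chunk not exported yet; poll again
            continue
        r.raise_for_status()            # fail on any other HTTP error
        return r.content
```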
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
             local_ds = metastore.get_dataset(self.local_ds_name)
 
             urls = list(urls)
-            while urls:
-                for url in urls:
-                    if self.should_check_for_status():
-                        self.check_for_status()
-
-                    r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                    if r.status_code == 404:
-                        time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                        # moving to the next url
-                        continue
-
-                    r.raise_for_status()
-
-                    df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
 
-
+            for url in urls:
+                if self.should_check_for_status():
+                    self.check_for_status()
 
-
-
+                df = pd.read_parquet(
+                    io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+                )
+                df = self.fix_columns(df)
 
-
-
-
-
-
+                inserted = warehouse.insert_dataset_rows(
+                    df, local_ds, self.local_ds_version
+                )
+                self.increase_counter(inserted)  # type: ignore [arg-type]
+                # sometimes progress bar doesn't get updated so manually updating it
+                self.update_progress_bar(self.progress_bar)
 
 
 @dataclass
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)
 
-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")
 
-        if
+        if cp and not output:
            raise ValueError("Please provide output directory for instantiation")
 
        studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
        signed_urls = export_response.data
 
        if signed_urls:
-            shuffle(signed_urls)
-
            with (
                self.metastore.clone() as metastore,
                self.warehouse.clone() as warehouse,
            ):
+
+                def batch(urls):
+                    """
+                    Batching urls in a way that fetching is most efficient as
+                    urls with lower id will be created first. Because that, we
+                    are making sure all threads are pulling most recent urls
+                    from beginning
+                    """
+                    res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                    current_worker = 0
+                    for url in signed_urls:
+                        res[current_worker].append(url)
+                        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                    return res
+
                rows_fetcher = DatasetRowsFetcher(
                    metastore,
                    warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                    local_ds_name,
                    local_ds_version,
                    schema,
+                    progress_bar=dataset_save_progress_bar,
                )
                try:
                    rows_fetcher.run(
-
-                        signed_urls,
-                        math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                    ),
-                    dataset_save_progress_bar,
+                        iter(batch(signed_urls)), dataset_save_progress_bar
                    )
                except:
                    self.remove_dataset(local_ds_name, local_ds_version)
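For reference, the `batch()` helper introduced in `pull_dataset()` replaces the old shuffle-plus-`batched` split with a round-robin distribution of signed URLs across the worker threads. A standalone sketch of that distribution (the thread count and URL names are placeholders):

```py
# Round-robin distribution of signed URLs across PULL_DATASET_MAX_THREADS workers,
# mirroring the batch() helper added in pull_dataset(). Values are placeholders.
PULL_DATASET_MAX_THREADS = 5


def batch(signed_urls: list[str]) -> list[list[str]]:
    buckets: list[list[str]] = [[] for _ in range(PULL_DATASET_MAX_THREADS)]
    current_worker = 0
    for url in signed_urls:
        buckets[current_worker].append(url)
        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
    return buckets


if __name__ == "__main__":
    urls = [f"chunk-{i:03d}.parquet.lz4" for i in range(12)]
    for worker, bucket in enumerate(batch(urls)):
        print(worker, bucket)  # the earliest chunks end up spread across all workers
```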
datachain/cli.py
CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )
 
+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+
 
 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--
+        "--cp",
         default=False,
         action="store_true",
-        help="
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.output,
             local_ds_name=args.local_name,
             local_ds_version=args.local_version,
-
+            cp=args.cp,
             force=bool(args.force),
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
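A self-contained sketch of the argument shape accepted by the new `studio cancel` subparser added above (a hypothetical parser wired the same way, not the real CLI; the job ID and team values are placeholders):

```py
# Hypothetical, minimal reproduction of the `studio cancel` argument parsing
# shown in the diff above; not the real CLI wiring.
from argparse import ArgumentParser

parser = ArgumentParser(prog="datachain studio")
subparsers = parser.add_subparsers(dest="command")

cancel = subparsers.add_parser("cancel", help="Cancel a job in Studio")
cancel.add_argument("job_id", action="store", help="The job ID to cancel.")
cancel.add_argument(
    "--team",
    action="store",
    default=None,
    help="The team to cancel a job for. By default, it will use team from config.",
)

args = parser.parse_args(["cancel", "JOB-123", "--team", "my-team"])
print(args.command, args.job_id, args.team)  # cancel JOB-123 my-team
```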
datachain/client/__init__.py
CHANGED
datachain/data_storage/sqlite.py
CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
 
     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)
 
     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)
 
     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )
 
     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-
-
-
-
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )
 
     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
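The warehouse change above wraps `executemany()` in a single transaction instead of letting each row insert commit on its own. A plain-sqlite3 sketch of the same idea (the table and data are placeholders, not the warehouse schema):

```py
# Plain-sqlite3 illustration of batching inserts in one transaction: one commit
# for the whole batch rather than one per row. Table and data are placeholders.
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE rows (id INTEGER PRIMARY KEY, name TEXT)")

rows = [(i, f"name-{i}") for i in range(10_000)]

with db:  # the connection context manager opens and commits a single transaction
    db.executemany("INSERT INTO rows (id, name) VALUES (?, ?)", rows)

print(db.execute("SELECT count(*) FROM rows").fetchone()[0])  # 10000
```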
datachain/lib/dc.py
CHANGED
@@ -19,7 +19,6 @@ from typing import (
 )
 
 import orjson
-import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
@@ -42,7 +41,7 @@ from datachain.lib.listing import (
     parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
 if TYPE_CHECKING:
+    import pandas as pd
     from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self
 
@@ -554,8 +554,7 @@ class DataChain:
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +563,12 @@ class DataChain:
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
 
@@ -594,75 +593,14 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name =
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
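The hunks above fold the separate `from_jsonl()` classmethod into a single JSON reader whose `meta_type` parameter is renamed to `format`. A hedged usage sketch (the method is the one whose body is shown above, presumably `DataChain.from_json`; the URI and row limit are placeholders):

```py
# Hedged sketch of reading JSON-lines metadata with the renamed `format=` parameter
# (replacing the old `meta_type=` / `from_jsonl()`). URI and nrows are placeholders.
from datachain.lib.dc import DataChain

chain = DataChain.from_json(
    "gs://datachain-demo/annotations/",  # placeholder storage URI
    format="jsonl",                      # "json" or "jsonl", per the docstring above
    nrows=10,                            # optional row limit for jsonl / JSON arrays
)
```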
@@ -793,47 +731,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1521,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
@@ -1701,6 +1747,8 @@ class DataChain:
         Parameters:
             flatten : Whether to use a multiindex or flatten column names.
         """
+        import pandas as pd
+
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
             columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1772,8 @@ class DataChain:
            transpose : Whether to transpose rows and columns.
            truncate : Whether or not to truncate the contents of columns.
        """
+        import pandas as pd
+
        dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
        df = dc.to_pandas(flatten)
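A hedged usage sketch of the new `compare()` / `diff()` API added above (dataset and column names are placeholders; `from_dataset` is assumed here to be the usual way to load a previously saved chain):

```py
# Hedged example of the new comparison API; dataset and column names are placeholders.
from datachain.lib.dc import DataChain

persons = DataChain.from_dataset("persons")          # assumed existing datasets
new_persons = DataChain.from_dataset("new_persons")

changes = persons.compare(
    new_persons,
    on=["id"],            # match rows by id
    compare=["name"],     # a row counts as modified if name differs
    added=True,
    deleted=True,
    modified=True,
    same=False,
    status_col="diff",    # extra column with values A / D / M / U
)

# File-based chains can use the thinner wrapper that matches on file.source/path
# and compares file.version/etag:
# changes = images.diff(new_images, on="file", status_col="diff")
```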