datachain 0.7.10__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of datachain has been flagged as potentially problematic.

@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join

 from .datasource import DataSource

@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11

 # dataset pull
-PULL_DATASET_MAX_THREADS = 10
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar

     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")

+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds = metastore.get_dataset(self.local_ds_name)

         urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
-
-                r.raise_for_status()
-
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))

-                self.fix_columns(df)
+        for url in urls:
+            if self.should_check_for_status():
+                self.check_for_status()

-                # id will be autogenerated in DB
-                df = df.drop("sys__id", axis=1)
+            df = pd.read_parquet(
+                io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+            )
+            df = self.fix_columns(df)

-                inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
-                )
-                self.increase_counter(inserted)  # type: ignore [arg-type]
-                urls.remove(url)
+            inserted = warehouse.insert_dataset_rows(
+                df, local_ds, self.local_ds_version
+            )
+            self.increase_counter(inserted)  # type: ignore [arg-type]
+            # sometimes progress bar doesn't get updated so manually updating it
+            self.update_progress_bar(self.progress_bar)


 @dataclass
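The refactor above pulls the download logic out of `do_task` into the new `get_parquet_content` helper: it polls a signed URL until the exported chunk exists (the remote returns 404 while the chunk is still being exported) and only then returns the bytes, so a worker now blocks on its own URL instead of skipping to the next one. A minimal standalone sketch of that polling pattern, with hypothetical constant names standing in for the module-level ones:

```py
import time

import requests

CHUNK_TIMEOUT = 3600   # stand-in for PULL_DATASET_CHUNK_TIMEOUT
SLEEP_INTERVAL = 0.1   # stand-in for PULL_DATASET_SLEEP_INTERVAL


def poll_chunk(url: str) -> bytes:
    """Fetch a chunk that may not exist yet; a 404 means 'not exported yet'."""
    while True:
        r = requests.get(url, timeout=CHUNK_TIMEOUT)
        if r.status_code == 404:
            # chunk not ready yet: wait and retry the same URL
            time.sleep(SLEEP_INTERVAL)
            continue
        r.raise_for_status()
        return r.content
```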
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-        no_cp: bool = False,
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if no_cp:
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")

-        if not output and not no_cp:
+        if cp and not output:
            raise ValueError("Please provide output directory for instantiation")

        studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
         signed_urls = export_response.data

         if signed_urls:
-            shuffle(signed_urls)
-
             with (
                 self.metastore.clone() as metastore,
                 self.warehouse.clone() as warehouse,
             ):
+
+                def batch(urls):
+                    """
+                    Batching urls in a way that fetching is most efficient as
+                    urls with lower id will be created first. Because that, we
+                    are making sure all threads are pulling most recent urls
+                    from beginning
+                    """
+                    res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                    current_worker = 0
+                    for url in signed_urls:
+                        res[current_worker].append(url)
+                        current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                    return res
+
                 rows_fetcher = DatasetRowsFetcher(
                     metastore,
                     warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                     local_ds_name,
                     local_ds_version,
                     schema,
+                    progress_bar=dataset_save_progress_bar,
                 )
                 try:
                     rows_fetcher.run(
-                        batched(
-                            signed_urls,
-                            math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                        ),
-                        dataset_save_progress_bar,
+                        iter(batch(signed_urls)), dataset_save_progress_bar
                     )
                 except:
                     self.remove_dataset(local_ds_name, local_ds_version)
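The new inner `batch` helper replaces the old shuffle-plus-`batched` split: URLs are dealt out round-robin across `PULL_DATASET_MAX_THREADS` buckets, so every worker starts on the earliest-exported (lowest-id) chunks instead of one worker receiving a contiguous tail. A tiny self-contained sketch of the same distribution (names are illustrative):

```py
def round_robin(items, n_workers: int):
    """Distribute items across n_workers buckets in round-robin order."""
    buckets = [[] for _ in range(n_workers)]
    for i, item in enumerate(items):
        buckets[i % n_workers].append(item)
    return buckets


print(round_robin(list(range(7)), 3))
# [[0, 3, 6], [1, 4], [2, 5]]
```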
datachain/cli.py CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )

+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+

 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--no-cp",
+        "--cp",
         default=False,
         action="store_true",
-        help="Do not copy files, just pull a remote dataset into local DB",
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
            args.output,
            local_ds_name=args.local_name,
            local_ds_version=args.local_version,
-           no_cp=args.no_cp,
+           cp=args.cp,
            force=bool(args.force),
            edatachain=args.edatachain,
            edatachain_file=args.edatachain_file,
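Taken together with the `pull_dataset` changes above, the `pull` command's default behavior flips in 0.8.0: it used to copy files unless `--no-cp` was given, and now it only loads the remote dataset into the local DB unless `--cp` is passed, in which case an output directory is required. A small argparse sketch of the new flag semantics, not the real CLI wiring:

```py
import argparse

# Illustrative only: mirrors the new --cp flag, not datachain's actual parser setup.
parser = argparse.ArgumentParser(prog="pull-sketch")
parser.add_argument(
    "--cp",
    default=False,
    action="store_true",
    help="Copy actual files after pulling remote dataset into local DB",
)
parser.add_argument("--output", default=None)

args = parser.parse_args(["--cp", "--output", "data/"])
assert args.cp is True  # without --cp, pull_dataset(cp=False) skips instantiation
```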
@@ -1,4 +1,3 @@
 from .fsspec import Client
-from .s3 import ClientS3

-__all__ = ["Client", "ClientS3"]
+__all__ = ["Client"]
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):

     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)

     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)

     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(table_name, self.db, if_exists="append", index=False)
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )

     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-        self.db.executemany(
-            table.insert().values({f: bindparam(f) for f in rows[0]}),
-            rows,
-        )
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )

     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
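Both SQLite changes target bulk-insert speed: `insert_dataframe` now asks pandas for multi-row `INSERT` statements in chunks of 1000, and the warehouse insert path wraps `executemany` in a single transaction so SQLite does not commit once per statement. A generic `sqlite3` sketch of the transaction trick (illustrative, not datachain's wrapper API):

```py
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rows (id INTEGER, name TEXT)")

data = [(i, f"name-{i}") for i in range(10_000)]

# One explicit transaction around the bulk insert: a single commit at the end
# instead of an implicit commit per statement.
with conn:
    conn.executemany("INSERT INTO rows (id, name) VALUES (?, ?)", data)

print(conn.execute("SELECT count(*) FROM rows").fetchone()[0])  # 10000
```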
datachain/lib/dc.py CHANGED
@@ -19,7 +19,6 @@ from typing import (
 )

 import orjson
-import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
@@ -42,7 +41,7 @@ from datachain.lib.listing import (
    parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict

 if TYPE_CHECKING:
+    import pandas as pd
     from pyarrow import DataType as ArrowDataType
     from typing_extensions import Concatenate, ParamSpec, Self

@@ -554,8 +554,7 @@ class DataChain:
        jmespath: Optional[str] = None,
        object_name: Optional[str] = "",
        model_name: Optional[str] = None,
-       print_schema: Optional[bool] = False,
-       meta_type: Optional[str] = "json",
+       format: Optional[str] = "json",
        nrows=None,
        **kwargs,
    ) -> "DataChain":
@@ -564,12 +563,12 @@ class DataChain:
        Parameters:
            path : storage URI with directory. URI must start with storage prefix such
                as `s3://`, `gs://`, `az://` or "file:///"
-           type : read file as "binary", "text", or "image" data. Default is "binary".
+           type : read file as "binary", "text", or "image" data. Default is "text".
            spec : optional Data Model
            schema_from : path to sample to infer spec (if schema not provided)
            object_name : generated object column name
            model_name : optional generated model name
-           print_schema : print auto-generated schema
+           format: "json", "jsonl"
            jmespath : optional JMESPATH expression to reduce JSON
            nrows : optional row limit for jsonl and JSON arrays

@@ -594,75 +593,14 @@ class DataChain:
        if (not object_name) and jmespath:
            object_name = jmespath_to_name(jmespath)
        if not object_name:
-           object_name = meta_type
-       chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-       signal_dict = {
-           object_name: read_meta(
-               schema_from=schema_from,
-               meta_type=meta_type,
-               spec=spec,
-               model_name=model_name,
-               print_schema=print_schema,
-               jmespath=jmespath,
-               nrows=nrows,
-           )
-       }
-       return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-   @classmethod
-   def from_jsonl(
-       cls,
-       path,
-       type: Literal["binary", "text", "image"] = "text",
-       spec: Optional[DataType] = None,
-       schema_from: Optional[str] = "auto",
-       jmespath: Optional[str] = None,
-       object_name: Optional[str] = "",
-       model_name: Optional[str] = None,
-       print_schema: Optional[bool] = False,
-       meta_type: Optional[str] = "jsonl",
-       nrows=None,
-       **kwargs,
-   ) -> "DataChain":
-       """Get data from JSON lines. It returns the chain itself.
-
-       Parameters:
-           path : storage URI with directory. URI must start with storage prefix such
-               as `s3://`, `gs://`, `az://` or "file:///"
-           type : read file as "binary", "text", or "image" data. Default is "binary".
-           spec : optional Data Model
-           schema_from : path to sample to infer spec (if schema not provided)
-           object_name : generated object column name
-           model_name : optional generated model name
-           print_schema : print auto-generated schema
-           jmespath : optional JMESPATH expression to reduce JSON
-           nrows : optional row limit for jsonl and JSON arrays
-
-       Example:
-           infer JSONl schema from data, limit parsing to 1 row
-           ```py
-           chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-           ```
-       """
-       if schema_from == "auto":
-           schema_from = path
-
-       def jmespath_to_name(s: str):
-           name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-           return s[:name_end]
-
-       if (not object_name) and jmespath:
-           object_name = jmespath_to_name(jmespath)
-       if not object_name:
-           object_name = meta_type
+           object_name = format
        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
        signal_dict = {
            object_name: read_meta(
                schema_from=schema_from,
-               meta_type=meta_type,
+               format=format,
                spec=spec,
                model_name=model_name,
-               print_schema=print_schema,
                jmespath=jmespath,
                nrows=nrows,
            )
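With `from_jsonl` removed and `meta_type`/`print_schema` folded into a single `format` argument, JSON-lines data now goes through `from_json`. A hedged usage sketch based on the signature shown above (bucket URIs are placeholders):

```py
from datachain.lib.dc import DataChain

# JSON-lines parsing via the unified entry point; previously DataChain.from_jsonl(...)
chain = DataChain.from_json("gs://myjsonl", format="jsonl", nrows=1)

# plain JSON with a JMESPATH filter still goes through the same method
coco = DataChain.from_json("gs://mybucket/annotations/", jmespath="annotations")
```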
@@ -793,47 +731,6 @@ class DataChain:
            **{object_name: catalog.listings()},  # type: ignore[arg-type]
        )

-   def print_json_schema(  # type: ignore[override]
-       self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-   ) -> "Self":
-       """Print JSON data model and save it. It returns the chain itself.
-
-       Parameters:
-           jmespath : JMESPATH expression to reduce JSON
-           model_name : generated model name
-
-       Example:
-           print JSON schema and save to column "meta_from":
-           ```py
-           uri = "gs://datachain-demo/coco2017/annotations_captions/"
-           chain = DataChain.from_storage(uri)
-           chain = chain.print_json_schema()
-           chain.save()
-           ```
-       """
-       return self.map(
-           meta_schema=lambda file: read_schema(
-               file, data_type="json", expr=jmespath, model_name=model_name
-           ),
-           output=str,
-       )
-
-   def print_jsonl_schema(  # type: ignore[override]
-       self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-   ) -> "Self":
-       """Print JSON data model and save it. It returns the chain itself.
-
-       Parameters:
-           jmespath : JMESPATH expression to reduce JSON
-           model_name : generated model name
-       """
-       return self.map(
-           meta_schema=lambda file: read_schema(
-               file, data_type="jsonl", expr=jmespath, model_name=model_name
-           ),
-           output=str,
-       )
-
    def save(  # type: ignore[override]
        self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
@@ -1624,6 +1521,155 @@ class DataChain:
        )
        return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

+   def compare(
+       self,
+       other: "DataChain",
+       on: Union[str, Sequence[str]],
+       right_on: Optional[Union[str, Sequence[str]]] = None,
+       compare: Optional[Union[str, Sequence[str]]] = None,
+       right_compare: Optional[Union[str, Sequence[str]]] = None,
+       added: bool = True,
+       deleted: bool = True,
+       modified: bool = True,
+       same: bool = False,
+       status_col: Optional[str] = None,
+   ) -> "DataChain":
+       """Comparing two chains by identifying rows that are added, deleted, modified
+       or same. Result is the new chain that has additional column with possible
+       values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+       rows respectively. Note that if only one "status" is asked, by setting proper
+       flags, this additional column is not created as it would have only one value
+       for all rows. Beside additional diff column, new chain has schema of the chain
+       on which method was called.
+
+       Parameters:
+           other: Chain to calculate diff from.
+           on: Column or list of columns to match on. If both chains have the
+               same columns then this column is enough for the match. Otherwise,
+               `right_on` parameter has to specify the columns for the other chain.
+               This value is used to find corresponding row in other dataset. If not
+               found there, row is considered as added (or removed if vice versa), and
+               if found then row can be either modified or same.
+           right_on: Optional column or list of columns
+               for the `other` to match.
+           compare: Column or list of columns to compare on. If both chains have
+               the same columns then this column is enough for the compare. Otherwise,
+               `right_compare` parameter has to specify the columns for the other
+               chain. This value is used to see if row is modified or same. If
+               not set, all columns will be used for comparison
+           right_compare: Optional column or list of columns
+               for the `other` to compare to.
+           added (bool): Whether to return added rows in resulting chain.
+           deleted (bool): Whether to return deleted rows in resulting chain.
+           modified (bool): Whether to return modified rows in resulting chain.
+           same (bool): Whether to return unchanged rows in resulting chain.
+           status_col (str): Name of the new column that is created in resulting chain
+               representing diff status.
+
+       Example:
+           ```py
+           diff = persons.diff(
+               new_persons,
+               on=["id"],
+               right_on=["other_id"],
+               compare=["name"],
+               added=True,
+               deleted=True,
+               modified=True,
+               same=True,
+               status_col="diff"
+           )
+           ```
+       """
+       from datachain.lib.diff import compare as chain_compare
+
+       return chain_compare(
+           self,
+           other,
+           on,
+           right_on=right_on,
+           compare=compare,
+           right_compare=right_compare,
+           added=added,
+           deleted=deleted,
+           modified=modified,
+           same=same,
+           status_col=status_col,
+       )
+
+   def diff(
+       self,
+       other: "DataChain",
+       on: str = "file",
+       right_on: Optional[str] = None,
+       added: bool = True,
+       modified: bool = True,
+       deleted: bool = False,
+       same: bool = False,
+       status_col: Optional[str] = None,
+   ) -> "DataChain":
+       """Similar to `.compare()`, which is more generic method to calculate difference
+       between two chains. Unlike `.compare()`, this method works only on those chains
+       that have `File` object, or it's derivatives, in it. File `source` and `path`
+       are used for matching, and file `version` and `etag` for comparing, while in
+       `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+       Parameters:
+           other: Chain to calculate diff from.
+           on: File signal to match on. If both chains have the
+               same file signal then this column is enough for the match. Otherwise,
+               `right_on` parameter has to specify the file signal for the other chain.
+               This value is used to find corresponding row in other dataset. If not
+               found there, row is considered as added (or removed if vice versa), and
+               if found then row can be either modified or same.
+           right_on: Optional file signal for the `other` to match.
+           added (bool): Whether to return added rows in resulting chain.
+           deleted (bool): Whether to return deleted rows in resulting chain.
+           modified (bool): Whether to return modified rows in resulting chain.
+           same (bool): Whether to return unchanged rows in resulting chain.
+           status_col (str): Optional name of the new column that is created in
+               resulting chain representing diff status.
+
+       Example:
+           ```py
+           diff = images.diff(
+               new_images,
+               on="file",
+               right_on="other_file",
+               added=True,
+               deleted=True,
+               modified=True,
+               same=True,
+               status_col="diff"
+           )
+           ```
+       """
+       on_file_signals = ["source", "path"]
+       compare_file_signals = ["version", "etag"]
+
+       def get_file_signals(file: str, signals):
+           return [f"{file}.{c}" for c in signals]
+
+       right_on = right_on or on
+
+       on_cols = get_file_signals(on, on_file_signals)
+       right_on_cols = get_file_signals(right_on, on_file_signals)
+       compare_cols = get_file_signals(on, compare_file_signals)
+       right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+       return self.compare(
+           other,
+           on_cols,
+           right_on=right_on_cols,
+           compare=compare_cols,
+           right_compare=right_compare_cols,
+           added=added,
+           deleted=deleted,
+           modified=modified,
+           same=same,
+           status_col=status_col,
+       )
+
    @classmethod
    def from_values(
        cls,
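The docstrings above already show the call shape; as a further hedged sketch, assuming the package's usual `C` column helper and `filter` method (not shown in this diff), the status column can be used to narrow the result:

```py
from datachain import C, DataChain

old = DataChain.from_storage("gs://bucket/images-v1/")   # placeholder URIs
new = DataChain.from_storage("gs://bucket/images-v2/")

# match on file.source/path, compare on file.version/etag (the diff() defaults)
delta = new.diff(old, on="file", added=True, modified=True, status_col="diff")

# keep only rows whose file content changed between the two listings
changed = delta.filter(C("diff") == "M")
```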
@@ -1701,6 +1747,8 @@ class DataChain:
        Parameters:
            flatten : Whether to use a multiindex or flatten column names.
        """
+       import pandas as pd
+
        headers, max_length = self._effective_signals_schema.get_headers_with_length()
        if flatten or max_length < 2:
            columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1772,8 @@ class DataChain:
        transpose : Whether to transpose rows and columns.
        truncate : Whether or not to truncate the contents of columns.
        """
+       import pandas as pd
+
        dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
        df = dc.to_pandas(flatten)
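These last two hunks, together with the earlier move of `import pandas as pd` under `TYPE_CHECKING`, defer the pandas import until a method actually needs it, so importing `datachain.lib.dc` no longer pays the pandas import cost at module load. A generic sketch of that deferred-import pattern (names are illustrative, not datachain's code):

```py
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # seen only by type checkers; no runtime import cost at module load
    import pandas as pd


def to_frame(records: list[dict]) -> "pd.DataFrame":
    import pandas as pd  # deferred: paid only when this function is called

    return pd.DataFrame.from_records(records)
```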