datachain 0.7.11__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -1,7 +1,6 @@
 import io
 import json
 import logging
-import math
 import os
 import os.path
 import posixpath
@@ -13,7 +12,6 @@ from collections.abc import Iterable, Iterator, Mapping, Sequence
 from copy import copy
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from random import shuffle
 from threading import Thread
 from typing import (
     IO,
@@ -58,11 +56,7 @@ from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
 from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
-from datachain.utils import (
-    DataChainDir,
-    batched,
-    datachain_paths_join,
-)
+from datachain.utils import DataChainDir, datachain_paths_join

 from .datasource import DataSource

@@ -90,7 +84,7 @@ QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE = 10
 QUERY_SCRIPT_CANCELED_EXIT_CODE = 11

 # dataset pull
-PULL_DATASET_MAX_THREADS = 10
+PULL_DATASET_MAX_THREADS = 5
 PULL_DATASET_CHUNK_TIMEOUT = 3600
 PULL_DATASET_SLEEP_INTERVAL = 0.1  # sleep time while waiting for chunk to be available
 PULL_DATASET_CHECK_STATUS_INTERVAL = 20  # interval to check export status in Studio
@@ -130,6 +124,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds_version: int,
         schema: dict[str, Union[SQLType, type[SQLType]]],
         max_threads: int = PULL_DATASET_MAX_THREADS,
+        progress_bar=None,
     ):
         super().__init__(max_threads)
         self._check_dependencies()
@@ -142,6 +137,7 @@ class DatasetRowsFetcher(NodesThreadPool):
         self.schema = schema
         self.last_status_check: Optional[float] = None
         self.studio_client = StudioClient()
+        self.progress_bar = progress_bar

     def done_task(self, done):
         for task in done:
@@ -198,6 +194,20 @@ class DatasetRowsFetcher(NodesThreadPool):
         for c in [c for c, t in self.schema.items() if t == DateTime]:
             df[c] = pd.to_datetime(df[c], unit="s")

+        # id will be autogenerated in DB
+        return df.drop("sys__id", axis=1)
+
+    def get_parquet_content(self, url: str):
+        while True:
+            if self.should_check_for_status():
+                self.check_for_status()
+            r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
+            if r.status_code == 404:
+                time.sleep(PULL_DATASET_SLEEP_INTERVAL)
+                continue
+            r.raise_for_status()
+            return r.content
+
     def do_task(self, urls):
         import lz4.frame
         import pandas as pd
@@ -207,31 +217,22 @@ class DatasetRowsFetcher(NodesThreadPool):
         local_ds = metastore.get_dataset(self.local_ds_name)

         urls = list(urls)
-        while urls:
-            for url in urls:
-                if self.should_check_for_status():
-                    self.check_for_status()
-
-                r = requests.get(url, timeout=PULL_DATASET_CHUNK_TIMEOUT)
-                if r.status_code == 404:
-                    time.sleep(PULL_DATASET_SLEEP_INTERVAL)
-                    # moving to the next url
-                    continue
-
-                r.raise_for_status()
-
-                df = pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))

-                self.fix_columns(df)
+        for url in urls:
+            if self.should_check_for_status():
+                self.check_for_status()

-                # id will be autogenerated in DB
-                df = df.drop("sys__id", axis=1)
+            df = pd.read_parquet(
+                io.BytesIO(lz4.frame.decompress(self.get_parquet_content(url)))
+            )
+            df = self.fix_columns(df)

-                inserted = warehouse.insert_dataset_rows(
-                    df, local_ds, self.local_ds_version
-                )
-                self.increase_counter(inserted)  # type: ignore [arg-type]
-                urls.remove(url)
+            inserted = warehouse.insert_dataset_rows(
+                df, local_ds, self.local_ds_version
+            )
+            self.increase_counter(inserted)  # type: ignore [arg-type]
+            # sometimes progress bar doesn't get updated so manually updating it
+            self.update_progress_bar(self.progress_bar)


 @dataclass
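The new `get_parquet_content` helper factors the 404-polling out of `do_task`: Studio returns 404 until a chunk has been exported, so the client sleeps and retries until the object exists, then decompresses and parses it. A minimal standalone sketch of the same pattern (the function name and parameters here are illustrative, not part of the datachain API):

```py
import io
import time

import lz4.frame
import pandas as pd
import requests


def fetch_parquet_chunk(url: str, timeout: float = 3600, poll_interval: float = 0.1) -> pd.DataFrame:
    """Poll a signed URL until the exported chunk exists, then load it."""
    while True:
        r = requests.get(url, timeout=timeout)
        if r.status_code == 404:
            # chunk not exported yet; wait and try again
            time.sleep(poll_interval)
            continue
        r.raise_for_status()
        # chunks are lz4-compressed parquet, as in DatasetRowsFetcher above
        return pd.read_parquet(io.BytesIO(lz4.frame.decompress(r.content)))
```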
@@ -1291,13 +1292,13 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def pull_dataset(  # noqa: PLR0915
+    def pull_dataset(  # noqa: C901, PLR0915
         self,
         remote_ds_uri: str,
         output: Optional[str] = None,
         local_ds_name: Optional[str] = None,
         local_ds_version: Optional[int] = None,
-        no_cp: bool = False,
+        cp: bool = False,
         force: bool = False,
         edatachain: bool = False,
         edatachain_file: Optional[str] = None,
@@ -1305,7 +1306,7 @@ class Catalog:
         client_config=None,
     ) -> None:
         def _instantiate(ds_uri: str) -> None:
-            if no_cp:
+            if not cp:
                 return
             assert output
             self.cp(
@@ -1318,7 +1319,7 @@ class Catalog:
             )
             print(f"Dataset {ds_uri} instantiated locally to {output}")

-        if not output and not no_cp:
+        if cp and not output:
             raise ValueError("Please provide output directory for instantiation")

         studio_client = StudioClient()
@@ -1417,12 +1418,26 @@ class Catalog:
            signed_urls = export_response.data

            if signed_urls:
-                shuffle(signed_urls)
-
                with (
                    self.metastore.clone() as metastore,
                    self.warehouse.clone() as warehouse,
                ):
+
+                    def batch(urls):
+                        """
+                        Batching urls in a way that fetching is most efficient as
+                        urls with lower id will be created first. Because that, we
+                        are making sure all threads are pulling most recent urls
+                        from beginning
+                        """
+                        res = [[] for i in range(PULL_DATASET_MAX_THREADS)]
+                        current_worker = 0
+                        for url in signed_urls:
+                            res[current_worker].append(url)
+                            current_worker = (current_worker + 1) % PULL_DATASET_MAX_THREADS
+
+                        return res
+
                    rows_fetcher = DatasetRowsFetcher(
                        metastore,
                        warehouse,
@@ -1431,14 +1446,11 @@ class Catalog:
                        local_ds_name,
                        local_ds_version,
                        schema,
+                        progress_bar=dataset_save_progress_bar,
                    )
                    try:
                        rows_fetcher.run(
-                            batched(
-                                signed_urls,
-                                math.ceil(len(signed_urls) / PULL_DATASET_MAX_THREADS),
-                            ),
-                            dataset_save_progress_bar,
+                            iter(batch(signed_urls)), dataset_save_progress_bar
                        )
                    except:
                        self.remove_dataset(local_ds_name, local_ds_version)
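The `batch()` helper above replaces the old shuffle-plus-`batched()` scheme with round-robin assignment, so every worker starts on the lowest-numbered (earliest-exported) chunks. A minimal sketch of the same idea, with a hypothetical `num_workers` parameter standing in for the module constant:

```py
def round_robin_batches(urls: list[str], num_workers: int) -> list[list[str]]:
    # distribute urls across workers in round-robin order, preserving
    # the original ordering within each worker's batch
    batches: list[list[str]] = [[] for _ in range(num_workers)]
    for i, url in enumerate(urls):
        batches[i % num_workers].append(url)
    return batches


# round_robin_batches(["u1", "u2", "u3", "u4", "u5"], 2)
# -> [["u1", "u3", "u5"], ["u2", "u4"]]
```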
datachain/cli.py CHANGED
@@ -294,6 +294,28 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         help="Python package requirement. Can be specified multiple times.",
     )

+    studio_cancel_help = "Cancel a job in Studio"
+    studio_cancel_description = "This command cancels a job in Studio."
+
+    studio_cancel_parser = studio_subparser.add_parser(
+        "cancel",
+        parents=[parent_parser],
+        description=studio_cancel_description,
+        help=studio_cancel_help,
+    )
+
+    studio_cancel_parser.add_argument(
+        "job_id",
+        action="store",
+        help="The job ID to cancel.",
+    )
+    studio_cancel_parser.add_argument(
+        "--team",
+        action="store",
+        default=None,
+        help="The team to cancel a job for. By default, it will use team from config.",
+    )
+

 def get_parser() -> ArgumentParser:  # noqa: PLR0915
     try:
@@ -457,10 +479,10 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Copy directories recursively",
     )
     parse_pull.add_argument(
-        "--no-cp",
+        "--cp",
         default=False,
         action="store_true",
-        help="Do not copy files, just pull a remote dataset into local DB",
+        help="Copy actual files after pulling remote dataset into local DB",
     )
     parse_pull.add_argument(
         "--edatachain",
@@ -1300,7 +1322,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
             args.output,
             local_ds_name=args.local_name,
             local_ds_version=args.local_version,
-            no_cp=args.no_cp,
+            cp=args.cp,
             force=bool(args.force),
             edatachain=args.edatachain,
             edatachain_file=args.edatachain_file,
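The semantics are inverted here: the old `--no-cp` flag is gone and file copying is now opt-in via `--cp`. Assuming the subcommand is registered as `datachain pull`, a bare pull only registers the remote dataset in the local DB, while adding `--cp` (together with an output location, which `pull_dataset` now requires in that case) also materializes the files locally.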
datachain/data_storage/sqlite.py CHANGED
@@ -209,10 +209,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):

     @retry_sqlite_locks
     def executemany(
-        self, query, params, cursor: Optional[sqlite3.Cursor] = None
+        self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
     ) -> sqlite3.Cursor:
         if cursor:
             return cursor.executemany(self.compile(query).string, params)
+        if conn:
+            return conn.executemany(self.compile(query).string, params)
         return self.db.executemany(self.compile(query).string, params)

     @retry_sqlite_locks
@@ -222,7 +224,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         return self.db.execute(sql, parameters)

     def insert_dataframe(self, table_name: str, df) -> int:
-        return df.to_sql(table_name, self.db, if_exists="append", index=False)
+        return df.to_sql(
+            table_name,
+            self.db,
+            if_exists="append",
+            index=False,
+            method="multi",
+            chunksize=1000,
+        )

     def cursor(self, factory=None):
         if factory is None:
@@ -545,10 +554,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         rows = list(rows)
         if not rows:
             return
-        self.db.executemany(
-            table.insert().values({f: bindparam(f) for f in rows[0]}),
-            rows,
-        )
+
+        with self.db.transaction() as conn:
+            # transactions speeds up inserts significantly as there is no separate
+            # transaction created for each insert row
+            self.db.executemany(
+                table.insert().values({f: bindparam(f) for f in rows[0]}),
+                rows,
+                conn=conn,
+            )

     def insert_dataset_rows(self, df, dataset: DatasetRecord, version: int) -> int:
         dr = self.dataset_rows(dataset, version)
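The comment in the hunk above captures the rationale: without an explicit transaction, SQLite effectively commits per statement, so bulk inserts pay the commit cost for every row. A minimal illustration of the same effect using only the standard-library `sqlite3` module (independent of datachain's own `DatabaseEngine` wrapper):

```py
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rows (id INTEGER, name TEXT)")

data = [(i, f"name-{i}") for i in range(10_000)]

# one explicit transaction around the whole batch: a single commit
# instead of one implicit commit per INSERT
with conn:  # commits on success, rolls back on error
    conn.executemany("INSERT INTO rows VALUES (?, ?)", data)
```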
datachain/lib/dc.py CHANGED
@@ -41,7 +41,7 @@ from datachain.lib.listing import (
     parse_listing_uri,
 )
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -554,8 +554,7 @@ class DataChain:
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +563,12 @@ class DataChain:

         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-            print_schema : print auto-generated schema
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
@@ -594,75 +593,14 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = meta_type
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-                meta_type=meta_type,
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
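With `meta_type` and `print_schema` gone (and the separate `from_jsonl` classmethod removed), the input format is now selected through the single `format` parameter. A minimal usage sketch based on the signature above (the bucket URI is made up for the example):

```py
from datachain.lib.dc import DataChain

# infer the schema from a JSON-lines sample and parse at most one row per file
chain = DataChain.from_json("gs://mybucket/meta/", format="jsonl", nrows=1)
```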
@@ -793,47 +731,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )

-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1521,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]

+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
datachain/lib/diff.py ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Comparing two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns prefix of right of two companion left - right columns
+        from merge. If companion columns have the same name then prefix will
+        be present in right column name, otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must be have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must be have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # have additional column)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
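A minimal end-to-end sketch of the new comparison API, built on `DataChain.from_values` (which already exists in `datachain.lib.dc`); the column names and values here are made up for the example:

```py
from datachain.lib.dc import DataChain

old = DataChain.from_values(id=[1, 2, 3], name=["a", "b", "c"])
new = DataChain.from_values(id=[2, 3, 4], name=["b", "C", "d"])

# rows only in `new` are marked "A", rows only in `old` are "D",
# matching ids with a changed name are "M", identical rows are "S"
result = new.compare(old, on=["id"], compare=["name"], same=True, status_col="diff")
```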
datachain/lib/meta_formats.py CHANGED
@@ -38,38 +38,41 @@ def process_json(data_string, jmespath):
     return json_dict


-# Print a dynamic datamodel-codegen output from JSON or CSV on stdout
-def read_schema(source_file, data_type="csv", expr=None, model_name=None):
+def gen_datamodel_code(
+    source_file, format="json", jmespath=None, model_name=None
+) -> str:
+    """Generates Python code with Pydantic models that corresponds
+    to the provided JSON, CSV, or JSONL file.
+    It support root JSON arrays (samples the first entry).
+    """
     data_string = ""
     # using uiid to get around issue #1617
     if not model_name:
         # comply with Python class names
         uid_str = str(generate_uuid()).replace("-", "")
-        model_name = f"Model{data_type}{uid_str}"
-    try:
-        with source_file.open() as fd:  # CSV can be larger than memory
-            if data_type == "csv":
-                data_string += fd.readline().replace("\r", "")
-                data_string += fd.readline().replace("\r", "")
-            elif data_type == "jsonl":
-                data_string = fd.readline().replace("\r", "")
-            else:
-                data_string = fd.read()  # other meta must fit into RAM
-    except OSError as e:
-        print(f"An unexpected file error occurred: {e}")
-        return
-    if data_type in ("json", "jsonl"):
-        json_object = process_json(data_string, expr)
-        if data_type == "json" and isinstance(json_object, list):
+        model_name = f"Model{format}{uid_str}"
+
+    with source_file.open() as fd:  # CSV can be larger than memory
+        if format == "csv":
+            data_string += fd.readline().replace("\r", "")
+            data_string += fd.readline().replace("\r", "")
+        elif format == "jsonl":
+            data_string = fd.readline().replace("\r", "")
+        else:
+            data_string = fd.read()  # other meta must fit into RAM
+
+    if format in ("json", "jsonl"):
+        json_object = process_json(data_string, jmespath)
+        if format == "json" and isinstance(json_object, list):
             json_object = json_object[0]  # sample the 1st object from JSON array
-        if data_type == "jsonl":
-            data_type = "json"  # treat json line as plain JSON in auto-schema
+        if format == "jsonl":
+            format = "json"  # treat json line as plain JSON in auto-schema
         data_string = json.dumps(json_object)

     import datamodel_code_generator

     input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
-    input_file_type = input_file_types[data_type]
+    input_file_type = input_file_types[format]
     with tempfile.TemporaryDirectory() as tmpdir:
         output = Path(tmpdir) / "model.py"
         datamodel_code_generator.generate(
@@ -95,36 +98,29 @@ spec = {model_name}
 def read_meta(  # noqa: C901
     spec=None,
     schema_from=None,
-    meta_type="json",
+    format="json",
     jmespath=None,
-    print_schema=False,
     model_name=None,
     nrows=None,
 ) -> Callable:
     from datachain.lib.dc import DataChain

     if schema_from:
-        chain = (
-            DataChain.from_storage(schema_from, type="text")
-            .limit(1)
-            .map(  # dummy column created (#1615)
-                meta_schema=lambda file: read_schema(
-                    file, data_type=meta_type, expr=jmespath, model_name=model_name
-                ),
-                output=str,
-            )
+        file = next(
+            DataChain.from_storage(schema_from, type="text").limit(1).collect("file")
         )
-        (model_output,) = chain.collect("meta_schema")
-        assert isinstance(model_output, str)
-        if print_schema:
-            print(f"{model_output}")
+        model_code = gen_datamodel_code(
+            file, format=format, jmespath=jmespath, model_name=model_name
+        )
+        assert isinstance(model_code, str)
+
     # Below 'spec' should be a dynamically converted DataModel from Pydantic
     if not spec:
         gl = globals()
-        exec(model_output, gl)  # type: ignore[arg-type] # noqa: S102
+        exec(model_code, gl)  # type: ignore[arg-type] # noqa: S102
         spec = gl["spec"]

-    if not (spec) and not (schema_from):
+    if not spec and not schema_from:
         raise ValueError(
             "Must provide a static schema in spec: or metadata sample in schema_from:"
         )
@@ -136,7 +132,7 @@ def read_meta(  # noqa: C901
     def parse_data(
         file: File,
         data_model=spec,
-        meta_type=meta_type,
+        format=format,
         jmespath=jmespath,
         nrows=nrows,
     ) -> Iterator[spec]:
@@ -148,7 +144,7 @@ def read_meta(  # noqa: C901
            except ValidationError as e:
                print(f"Validation error occurred in row {nrow} file {file.name}:", e)

-        if meta_type == "csv":
+        if format == "csv":
            with (
                file.open() as fd
            ):  # TODO: if schema is statically given, should allow CSV without headers
@@ -156,7 +152,7 @@ def read_meta(  # noqa: C901
                for row in reader:  # CSV can be larger than memory
                    yield from validator(row)

-        if meta_type == "json":
+        if format == "json":
            try:
                with file.open() as fd:  # JSON must fit into RAM
                    data_string = fd.read()
@@ -174,7 +170,7 @@ def read_meta(  # noqa: C901
                    return
                yield from validator(json_dict, nrow)

-        if meta_type == "jsonl":
+        if format == "jsonl":
            try:
                nrow = 0
                with file.open() as fd:
datachain/query/dataset.py CHANGED
@@ -1069,6 +1069,7 @@ class DatasetQuery:
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
         self.starting_step = QueryStep(self.catalog, name, self.version)
+        self.dialect = self.catalog.warehouse.db.dialect

     def __iter__(self):
         return iter(self.db_results())
datachain/remote/studio.py CHANGED
@@ -2,7 +2,7 @@ import base64
 import json
 import logging
 import os
-from collections.abc import Iterable, Iterator
+from collections.abc import AsyncIterator, Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
 from typing import (
@@ -11,6 +11,9 @@ from typing import (
     Optional,
     TypeVar,
 )
+from urllib.parse import urlparse, urlunparse
+
+import websockets

 from datachain.config import Config
 from datachain.dataset import DatasetStats
@@ -22,6 +25,7 @@ LsData = Optional[list[dict[str, Any]]]
 DatasetInfoData = Optional[dict[str, Any]]
 DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
+DatasetJobVersionsData = Optional[dict[str, Any]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
 FileUploadData = Optional[dict[str, Any]]
@@ -231,6 +235,40 @@ class StudioClient:

         return msgpack.ExtType(code, data)

+    async def tail_job_logs(self, job_id: str) -> AsyncIterator[dict]:
+        """
+        Follow job logs via websocket connection.
+
+        Args:
+            job_id: ID of the job to follow logs for
+
+        Yields:
+            Dict containing either job status updates or log messages
+        """
+        parsed_url = urlparse(self.url)
+        ws_url = urlunparse(
+            parsed_url._replace(scheme="wss" if parsed_url.scheme == "https" else "ws")
+        )
+        ws_url = f"{ws_url}/logs/follow/?job_id={job_id}&team_name={self.team}"
+
+        async with websockets.connect(
+            ws_url,
+            additional_headers={"Authorization": f"token {self.token}"},
+        ) as websocket:
+            while True:
+                try:
+                    message = await websocket.recv()
+                    data = json.loads(message)
+
+                    # Yield the parsed message data
+                    yield data
+
+                except websockets.exceptions.ConnectionClosed:
+                    break
+                except Exception as e:  # noqa: BLE001
+                    logger.error("Error receiving websocket message: %s", e)
+                    break
+
     def ls(self, paths: Iterable[str]) -> Iterator[tuple[str, Response[LsData]]]:
         # TODO: change LsData (response.data value) to be list of lists
         # to handle cases where a path will be expanded (i.e. globs)
@@ -302,6 +340,13 @@ class StudioClient:
             method="GET",
         )

+    def dataset_job_versions(self, job_id: str) -> Response[DatasetJobVersionsData]:
+        return self._send_request(
+            "datachain/datasets/dataset_job_versions",
+            {"job_id": job_id},
+            method="GET",
+        )
+
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
             "datachain/datasets/stats",
@@ -359,3 +404,10 @@ class StudioClient:
             "requirements": requirements,
         }
         return self._send_request("datachain/job", data)
+
+    def cancel_job(
+        self,
+        job_id: str,
+    ) -> Response[JobData]:
+        url = f"datachain/job/{job_id}/cancel"
+        return self._send_request(url, data={}, method="POST")
datachain/studio.py CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 import os
 from typing import TYPE_CHECKING, Optional

@@ -19,7 +20,7 @@ POST_LOGIN_MESSAGE = (
 )


-def process_studio_cli_args(args: "Namespace"):
+def process_studio_cli_args(args: "Namespace"):  # noqa: PLR0911
     if args.cmd == "login":
         return login(args)
     if args.cmd == "logout":
@@ -47,6 +48,9 @@ def process_studio_cli_args(args: "Namespace"):
             args.req_file,
         )

+    if args.cmd == "cancel":
+        return cancel_job(args.job_id, args.team)
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -227,8 +231,34 @@ def create_job(
     if not response.data:
         raise DataChainError("Failed to create job")

-    print(f"Job {response.data.get('job', {}).get('id')} created")
+    job_id = response.data.get("job", {}).get("id")
+    print(f"Job {job_id} created")
     print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+    print("=" * 40)
+
+    # Sync usage
+    async def _run():
+        async for message in client.tail_job_logs(job_id):
+            if "logs" in message:
+                for log in message["logs"]:
+                    print(log["message"], end="")
+            elif "job" in message:
+                print(f"\n>>>> Job is now in {message['job']['status']} status.")
+
+    asyncio.run(_run())
+
+    response = client.dataset_job_versions(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    response_data = response.data
+    if response_data:
+        dataset_versions = response_data.get("dataset_versions", [])
+        print("\n\n>>>> Dataset versions created during the job:")
+        for version in dataset_versions:
+            print(f" - {version.get('dataset_name')}@v{version.get('version')}")
+    else:
+        print("No dataset versions created during the job.")


 def upload_files(client: StudioClient, files: list[str]) -> list[str]:
@@ -248,3 +278,18 @@ def upload_files(client: StudioClient, files: list[str]) -> list[str]:
         if file_id:
             file_ids.append(str(file_id))
     return file_ids
+
+
+def cancel_job(job_id: str, team_name: Optional[str]):
+    token = Config().read().get("studio", {}).get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    client = StudioClient(team=team_name)
+    response = client.cancel_job(job_id)
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    print(f"Job {job_id} canceled")
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.11
+Version: 0.8.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -46,6 +46,7 @@ Requires-Dist: iterative-telemetry>=0.0.9
 Requires-Dist: platformdirs
 Requires-Dist: dvc-studio-client<1,>=0.21
 Requires-Dist: tabulate
+Requires-Dist: websockets
 Provides-Extra: docs
 Requires-Dist: mkdocs>=1.5.2; extra == "docs"
 Requires-Dist: mkdocs-gen-files>=0.5.0; extra == "docs"
@@ -98,7 +99,7 @@ Requires-Dist: unstructured[pdf]; extra == "examples"
 Requires-Dist: pdfplumber==0.11.4; extra == "examples"
 Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
 Requires-Dist: onnx==1.16.1; extra == "examples"
-Requires-Dist: ultralytics==8.3.48; extra == "examples"
+Requires-Dist: ultralytics==8.3.50; extra == "examples"

 ================
 |logo| DataChain
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=wQiYQ_qSVCGvS06pkknT9_FIBdFRzBdeRusW9uXE3vQ,42505
+datachain/cli.py,sha256=gNXVoMfKINUhKjOpYN48tpyNBK13M0hkQWqra4jNSJQ,43137
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
@@ -14,11 +14,11 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=Hr0Ha0kou0so4i8i-gWiXC1AYlJ2arI1D55cc7mi3tg,7253
+datachain/studio.py,sha256=BegIXunW1n-sZtHSe3a30Mw2MXexVGRn_GU-OzjRRKM,8725
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
+datachain/catalog/catalog.py,sha256=nuWjSIs4MO1hJa8-LQGbiMXLWWznPB_VKSVpS7368t4,58415
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
@@ -35,7 +35,7 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=hfTITcesE9XlUTxcCcdDyWGGep-QSjJL9DUxko5QCeI,37524
 datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
+datachain/data_storage/sqlite.py,sha256=iJv1QxwVifOowtYhIDqYVoea21dvkQIdxklGNIend3c,22961
 datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
 datachain/func/__init__.py,sha256=TG6JHFKtLi06Nd5iLszXIflEq-VKZcKMdgo_KiQ8SGQ,1055
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
@@ -53,13 +53,14 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
 datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
-datachain/lib/dc.py,sha256=qMhpVPdWeuXBDhmKKoq3fkq12Cx_ZPxDdsl_juu482o,89595
+datachain/lib/dc.py,sha256=7Wm6TEPVNCSh4bz0iA9JvEsYtYAZ9o97lK7TEJ8modE,92149
+datachain/lib/diff.py,sha256=Yurzyi7PzZzY80HOnVTpwtbWzSJ1LqN8NgZWwZOh_UU,6732
 datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
 datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
 datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
 datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
-datachain/lib/meta_formats.py,sha256=6_gB23fWlvd-edOO3UvDHvj6dBXVL61T7x8RX51FW84,6685
+datachain/lib/meta_formats.py,sha256=hDPfEkcmiLZOjhBBXuareMdnq65Wj8vZvxjmum6cROM,6377
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
@@ -88,7 +89,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=JrImhguXj2ZDwJpfuyhcgxSIlqSPy5NmLDLc3muFQJs,54610
+datachain/query/dataset.py,sha256=fECGctERQrfLIowN9Fo6dTSnmHEe9WbfcjHRtRObcio,54667
 datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -96,7 +97,7 @@ datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
 datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=WiK6fpRAw0a6Dth4XXI0YInEHH4gDU7AUHHDNd3wJzg,11616
+datachain/remote/studio.py,sha256=3DlgESETzxm3dgb6zzjjGxsddSkacT2dARnteLAfMxQ,13366
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -118,9 +119,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.11.dist-info/METADATA,sha256=ADTTf0_eJImM-tIPR-jQydM3N9Iis-ECRxWgkwLM8lU,8412
-datachain-0.7.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.11.dist-info/RECORD,,
+datachain-0.8.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.8.0.dist-info/METADATA,sha256=PXb2pYY67bdfDjFXR7C9hwN6LaKSmseRZJNFakrWfyg,8437
+datachain-0.8.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.8.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.8.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.8.0.dist-info/RECORD,,