datachain 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

@@ -144,19 +144,26 @@ def shutdown_process(
     return proc.wait()


-def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt

-        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-            line = buffer.decode("utf-8")
-            callback(line)
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt

-    if buffer:  # Handle any remaining data in the buffer
-        line = buffer.decode("utf-8")
-        callback(line)
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass


 class DatasetRowsFetcher(NodesThreadPool):
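The reader above emits a line on either a newline or a carriage return, so progress output that rewrites a single line (for example tqdm) still reaches the callback. A minimal standalone sketch of that behaviour (an illustration, not datachain's API):

```python
import io


def read_lines(stream, callback):
    # Mirror process_output: emit on "\n" or "\r" so carriage-return progress
    # updates are delivered as separate lines instead of staying buffered.
    buffer = b""
    while byt := stream.read(1):
        buffer += byt
        if byt in (b"\n", b"\r"):
            callback(buffer.decode("utf-8", errors="replace"))
            buffer = b""
    if buffer:  # trailing data without a final newline
        callback(buffer.decode("utf-8", errors="replace"))


read_lines(io.BytesIO(b"10%\r100%\nDone"), print)
# the callback is invoked three times: "10%\r", "100%\n", "Done"
```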
@@ -1760,13 +1767,13 @@ class Catalog:
             recursive=recursive,
         )

+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-        capture_output: bool = False,
-        output_hook: Callable[[str], None] = noop,
+        stdout_callback: Optional[Callable[[str], None]] = None,
+        stderr_callback: Optional[Callable[[str], None]] = None,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,
@@ -1781,13 +1788,18 @@ class Catalog:
            },
        )
        popen_kwargs: dict[str, Any] = {}
-        if capture_output:
-            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE

        def raise_termination_signal(sig: int, _: Any) -> NoReturn:
            raise TerminationSignal(sig)

-        thread: Optional[Thread] = None
+        stdout_thread: Optional[Thread] = None
+        stderr_thread: Optional[Thread] = None
+
        with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
            logger.info("Starting process %s", proc.pid)

@@ -1801,10 +1813,20 @@
            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
            signal.signal(signal.SIGTERM, raise_termination_signal)
            try:
-                if capture_output:
-                    args = (proc.stdout, output_hook)
-                    thread = Thread(target=_process_stream, args=args, daemon=True)
-                    thread.start()
+                if stdout_callback is not None:
+                    stdout_thread = Thread(
+                        target=process_output,
+                        args=(proc.stdout, stdout_callback),
+                        daemon=True,
+                    )
+                    stdout_thread.start()
+                if stderr_callback is not None:
+                    stderr_thread = Thread(
+                        target=process_output,
+                        args=(proc.stderr, stderr_callback),
+                        daemon=True,
+                    )
+                    stderr_thread.start()

                proc.wait()
            except TerminationSignal as exc:
@@ -1822,8 +1844,22 @@
            finally:
                signal.signal(signal.SIGTERM, orig_sigterm_handler)
                signal.signal(signal.SIGINT, orig_sigint_handler)
-                if thread:
-                    thread.join()  # wait for the reader thread
+                # wait for the reader thread
+                thread_join_timeout_seconds = 30
+                if stdout_thread is not None:
+                    stdout_thread.join(timeout=thread_join_timeout_seconds)
+                    if stdout_thread.is_alive():
+                        logger.warning(
+                            "stdout thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )
+                if stderr_thread is not None:
+                    stderr_thread.join(timeout=thread_join_timeout_seconds)
+                    if stderr_thread.is_alive():
+                        logger.warning(
+                            "stderr thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )

        logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
        if proc.returncode in (
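A standalone sketch of the pattern the new Catalog.query code follows (an assumed simplification, not the datachain API itself): one pipe and one reader thread per stream, joined with a timeout so a stuck reader cannot block the caller indefinitely.

```python
import subprocess
import sys
from threading import Thread


def pump(stream, callback):
    # Forward each line from the child process to the callback, then close the pipe.
    for raw in iter(stream.readline, b""):
        callback(raw.decode("utf-8", errors="replace"))
    stream.close()


cmd = [sys.executable, "-c", "import sys; print('hello'); print('oops', file=sys.stderr)"]
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
    stdout_thread = Thread(target=pump, args=(proc.stdout, lambda s: print("OUT:", s, end="")), daemon=True)
    stderr_thread = Thread(target=pump, args=(proc.stderr, lambda s: print("ERR:", s, end="")), daemon=True)
    stdout_thread.start()
    stderr_thread.start()
    proc.wait()
    for t in (stdout_thread, stderr_thread):
        t.join(timeout=30)  # same 30-second join timeout as in the diff above
```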
@@ -4,6 +4,7 @@ from enum import Enum
 class JobStatus(int, Enum):
     CREATED = 1
     SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
@@ -21,6 +21,7 @@ from sqlalchemy import (
     Table,
     Text,
     UniqueConstraint,
+    desc,
     select,
 )
 from sqlalchemy.sql import func as f
@@ -399,6 +400,7 @@ class AbstractMetastore(ABC, Serializable):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
     ) -> str:
         """
         Creates a new job.
@@ -443,6 +445,10 @@ class AbstractMetastore(ABC, Serializable):
     def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
         """Returns all checkpoints related to some job"""

+    @abstractmethod
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        """Get last created checkpoint for some job."""
+
     @abstractmethod
     def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
         """Gets single checkpoint by id"""
@@ -1548,6 +1554,7 @@ class AbstractDBMetastore(AbstractMetastore):
             Column("error_stack", Text, nullable=False, default=""),
             Column("params", JSON, nullable=False),
             Column("metrics", JSON, nullable=False),
+            Column("parent_job_id", Text, nullable=True),
         ]

     @cached_property
@@ -1595,6 +1602,7 @@ class AbstractDBMetastore(AbstractMetastore):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
         conn: Optional[Any] = None,
     ) -> str:
         """
@@ -1616,6 +1624,7 @@ class AbstractDBMetastore(AbstractMetastore):
                 error_stack="",
                 params=json.dumps(params or {}),
                 metrics=json.dumps({}),
+                parent_job_id=parent_job_id,
             ),
             conn=conn,
         )
@@ -1770,7 +1779,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return self.get_checkpoint_by_id(checkpoint_id)

-    def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[Checkpoint]:
         """List checkpoints by job id."""
         query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
         rows = list(self.db.execute(query, conn=conn))
@@ -1800,3 +1809,15 @@ class AbstractDBMetastore(AbstractMetastore):
         if not rows:
             return None
         return self.checkpoint_class.parse(*rows[0])
+
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        query = (
+            self._checkpoints_query()
+            .where(self._checkpoints.c.job_id == job_id)
+            .order_by(desc(self._checkpoints.c.created_at))
+            .limit(1)
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
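The new get_last_checkpoint is a standard "latest row" query. A generic SQLAlchemy Core sketch of the same ORDER BY ... DESC LIMIT 1 pattern against a throwaway in-memory table (table and column names here are illustrative, not datachain's schema):

```python
import datetime

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
meta = sa.MetaData()
checkpoints = sa.Table(
    "checkpoints",
    meta,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("job_id", sa.Text),
    sa.Column("created_at", sa.DateTime),
)
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(
        checkpoints.insert(),
        [
            {"job_id": "job-1", "created_at": datetime.datetime(2024, 1, 1)},
            {"job_id": "job-1", "created_at": datetime.datetime(2024, 1, 2)},
        ],
    )
    query = (
        sa.select(checkpoints)
        .where(checkpoints.c.job_id == "job-1")
        .order_by(sa.desc(checkpoints.c.created_at))
        .limit(1)
    )
    latest = conn.execute(query).first()  # newest matching row, or None if none exist
```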
@@ -1,5 +1,3 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain

-
 C = Column


-def get_status_col_name() -> str:
-    """Returns new unique status col name"""
-    return "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"


 class CompareStatus(str, Enum):
@@ -101,9 +95,9 @@ def _compare(  # noqa: C901, PLR0912
         compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]

     # get diff column names
-    diff_col = status_col or get_status_col_name()
-    ldiff_col = get_status_col_name()
-    rdiff_col = get_status_col_name()
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME

     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
@@ -227,7 +221,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col = get_status_col_name()
+    status_col = STATUS_COL_NAME

     res = _compare(
         left,
datachain/error.py CHANGED
@@ -101,3 +101,7 @@ class OutdatedDatabaseSchemaError(DataChainError):

 class CheckpointNotFoundError(NotFoundError):
     pass
+
+
+class JobNotFoundError(NotFoundError):
+    pass
@@ -0,0 +1,147 @@
+import hashlib
+import inspect
+import json
+import textwrap
+from collections.abc import Sequence
+from typing import TypeVar, Union
+
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BindParameter,
+    ColumnElement,
+    Label,
+    Over,
+    UnaryExpression,
+)
+from sqlalchemy.sql.functions import Function
+
+T = TypeVar("T", bound=ColumnElement)
+ColumnLike = Union[str, T]
+
+
+def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
+    """
+    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
+    """
+
+    # Binary operations: col > 5, col1 + col2, etc.
+    if isinstance(expr, BinaryExpression):
+        op = (
+            expr.operator.__name__
+            if hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+        return {
+            "type": "binary",
+            "op": op,
+            "left": serialize_column_element(expr.left),
+            "right": serialize_column_element(expr.right),
+        }
+
+    # Unary operations: -col, NOT col, etc.
+    if isinstance(expr, UnaryExpression):
+        op = (
+            expr.operator.__name__
+            if expr.operator is not None and hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+
+        return {
+            "type": "unary",
+            "op": op,
+            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
+        }
+
+    # Function calls: func.lower(col), func.count(col), etc.
+    if isinstance(expr, Function):
+        return {
+            "type": "function",
+            "name": expr.name,
+            "clauses": [serialize_column_element(c) for c in expr.clauses],
+        }
+
+    # Window functions: func.row_number().over(partition_by=..., order_by=...)
+    if isinstance(expr, Over):
+        return {
+            "type": "window",
+            "function": serialize_column_element(expr.element),
+            "partition_by": [
+                serialize_column_element(p) for p in getattr(expr, "partition_by", [])
+            ],
+            "order_by": [
+                serialize_column_element(o) for o in getattr(expr, "order_by", [])
+            ],
+        }
+
+    # Labeled expressions: col.label("alias")
+    if isinstance(expr, Label):
+        return {
+            "type": "label",
+            "name": expr.name,
+            "element": serialize_column_element(expr.element),
+        }
+
+    # Bound values (constants)
+    if isinstance(expr, BindParameter):
+        return {"type": "bind", "value": expr.value}
+
+    # Plain columns
+    if hasattr(expr, "name"):
+        return {"type": "column", "name": expr.name}
+
+    # Fallback: stringify unknown nodes
+    return {"type": "other", "repr": str(expr)}
+
+
+def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
+    """
+    Hash a list of ColumnElements deterministically, dialect agnostic.
+    Only accepts ordered iterables (like list or tuple).
+    """
+    serialized = [serialize_column_element(c) for c in columns]
+    json_str = json.dumps(serialized, sort_keys=True)  # stable JSON
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def hash_callable(func):
+    """
+    Calculate a hash from a callable.
+    Rules:
+      - Named functions (def) → use source code for stable, cross-version hashing
+      - Lambdas → use bytecode (deterministic in same Python runtime)
+    """
+    if not callable(func):
+        raise TypeError("Expected a callable")
+
+    # Determine if it is a lambda
+    is_lambda = func.__name__ == "<lambda>"
+
+    if not is_lambda:
+        # Try to get exact source of named function
+        try:
+            lines, _ = inspect.getsourcelines(func)
+            payload = textwrap.dedent("".join(lines)).strip()
+        except (OSError, TypeError):
+            # Fallback: bytecode if source not available
+            payload = func.__code__.co_code
+    else:
+        # For lambdas, fall back directly to bytecode
+        payload = func.__code__.co_code
+
+    # Normalize annotations
+    annotations = {
+        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
+    }
+
+    # Extras to distinguish functions with same code but different metadata
+    extras = {
+        "name": func.__name__,
+        "defaults": func.__defaults__,
+        "annotations": annotations,
+    }
+
+    # Compute SHA256
+    h = hashlib.sha256()
+    h.update(str(payload).encode() if isinstance(payload, str) else payload)
+    h.update(str(extras).encode())
+    return h.hexdigest()
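A short usage sketch for the two helpers in the new datachain/hash_utils.py module (import path taken from the wheel RECORD below; the example expressions are illustrative):

```python
from sqlalchemy import column, func

from datachain.hash_utils import hash_callable, hash_column_elements

# Dialect-agnostic, deterministic digest of SQLAlchemy column expressions
expr_digest = hash_column_elements([column("size") > 5, func.lower(column("name"))])


def add_one(x: int, step: int = 1) -> int:
    return x + step


# Named functions are hashed by source plus name, defaults and annotations
func_digest = hash_callable(add_one)

# Lambdas are hashed by bytecode, so the digest is only stable within one runtime
lambda_digest = hash_callable(lambda x: x + 1)
```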
datachain/job.py CHANGED
@@ -22,6 +22,7 @@ class Job:
     python_version: Optional[str] = None
     error_message: str = ""
     error_stack: str = ""
+    parent_job_id: Optional[str] = None

     @classmethod
     def parse(
@@ -39,6 +40,7 @@ class Job:
         error_stack: str,
         params: str,
         metrics: str,
+        parent_job_id: Optional[str],
     ) -> "Job":
         return cls(
             str(id),
@@ -54,4 +56,5 @@
             python_version,
             error_message,
             error_stack,
+            parent_job_id,
         )
@@ -19,7 +19,6 @@ from typing import (
     cast,
     overload,
 )
-from uuid import uuid4

 import sqlalchemy
 import ujson as json
@@ -30,10 +29,15 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import ProjectCreateNotAllowedError, ProjectNotFoundError
+from datachain.error import (
+    JobNotFoundError,
+    ProjectCreateNotAllowedError,
+    ProjectNotFoundError,
+)
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
+from datachain.job import Job
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.data_model import (
     DataModel,
@@ -50,11 +54,12 @@ from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
+from datachain.project import Project
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
 from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
-from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
+from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict

 from .database import DEFAULT_DATABASE_BATCH_SIZE
 from .utils import (
@@ -209,6 +214,14 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()

+    def hash(self) -> str:
+        """
+        Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+        It takes into account all the steps added to the chain and their inputs.
+        Order of the steps is important.
+        """
+        return self._query.hash()
+
     def _as_delta(
         self,
         on: Optional[Union[str, Sequence[str]]] = None,
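A hedged usage sketch for the new DataChain.hash() (read_values and C are assumed to be available at the top-level datachain namespace, as in recent releases; the digest value itself is arbitrary here):

```python
import datachain as dc

# assumption: read_values/C exported at package level as in current datachain docs
chain = dc.read_values(num=[1, 2, 3]).filter(dc.C("num") > 1)
digest = chain.hash()  # hex SHA-256 over the chain's steps and their inputs
assert len(digest) == 64
```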
@@ -570,6 +583,19 @@ class DataChain:
             query=self._query.save(project=project, feature_schema=schema)
         )

+    def _calculate_job_hash(self, job_id: str) -> str:
+        """
+        Calculates hash of the job at the place of this chain's save method.
+        Hash is calculated using previous job checkpoint hash (if exists) and
+        adding hash of this chain to produce new hash.
+        """
+        last_checkpoint = self.session.catalog.metastore.get_last_checkpoint(job_id)
+
+        return hashlib.sha256(
+            (bytes.fromhex(last_checkpoint.hash) if last_checkpoint else b"")
+            + bytes.fromhex(self.hash())
+        ).hexdigest()
+
     def save(  # type: ignore[override]
         self,
         name: str,
@@ -594,101 +620,171 @@ class DataChain:
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+
         catalog = self.session.catalog
-        if version is not None:
-            semver.validate(version)

-        if update_version is not None and update_version not in [
-            "patch",
-            "major",
-            "minor",
-        ]:
-            raise ValueError(
-                "update_version can have one of the following values: major, minor or"
-                " patch"
-            )
+        result = None  # result chain that will be returned at the end
+
+        # Version validation
+        self._validate_version(version)
+        self._validate_update_version(update_version)

         namespace_name, project_name, name = catalog.get_full_dataset_name(
             name,
             namespace_name=self._settings.namespace,
             project_name=self._settings.project,
         )
+        project = self._get_or_create_project(namespace_name, project_name)
+
+        # Checkpoint handling
+        job, _hash, result = self._resolve_checkpoint(name, project, kwargs)
+
+        # Schema preparation
+        schema = self.signals_schema.clone_without_sys_signals().serialize()
+
+        # Handle retry and delta functionality
+        if not result:
+            result = self._handle_delta(name, version, project, schema, kwargs)
+
+        if not result:
+            # calculate chain if we already don't have result from checkpoint or delta
+            result = self._evolve(
+                query=self._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    description=description,
+                    attrs=attrs,
+                    feature_schema=schema,
+                    update_version=update_version,
+                    **kwargs,
+                )
+            )
+
+        if job:
+            catalog.metastore.create_checkpoint(job.id, _hash)  # type: ignore[arg-type]

+        return result
+
+    def _validate_version(self, version: Optional[str]) -> None:
+        """Validate dataset version if provided."""
+        if version is not None:
+            semver.validate(version)
+
+    def _validate_update_version(self, update_version: Optional[str]) -> None:
+        """Ensure update_version is one of: major, minor, patch."""
+        allowed = ["major", "minor", "patch"]
+        if update_version not in allowed:
+            raise ValueError(f"update_version must be one of {allowed}")
+
+    def _get_or_create_project(self, namespace: str, project_name: str) -> Project:
+        """Get project or raise if creation not allowed."""
         try:
-            project = self.session.catalog.metastore.get_project(
+            return self.session.catalog.metastore.get_project(
                 project_name,
-                namespace_name,
+                namespace,
                 create=is_studio(),
             )
         except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
             raise ProjectCreateNotAllowedError("Creating project is not allowed") from e

-        schema = self.signals_schema.clone_without_sys_signals().serialize()
+    def _resolve_checkpoint(
+        self,
+        name: str,
+        project: Project,
+        kwargs: dict,
+    ) -> tuple[Optional[Job], Optional[str], Optional["DataChain"]]:
+        """Check if checkpoint exists and return cached dataset if possible."""
+        from .datasets import read_dataset

-        # Handle retry and delta functionality
-        if self.delta and name:
-            from datachain.delta import delta_retry_update
+        metastore = self.session.catalog.metastore

-            # Delta chains must have delta_on defined (ensured by _as_delta method)
-            assert self._delta_on is not None, "Delta chain must have delta_on defined"
+        job_id = os.getenv("DATACHAIN_JOB_ID")
+        checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)

-            result_ds, dependencies, has_changes = delta_retry_update(
-                self,
-                namespace_name,
-                project_name,
-                name,
-                on=self._delta_on,
-                right_on=self._delta_result_on,
-                compare=self._delta_compare,
-                delta_retry=self._delta_retry,
+        if not job_id:
+            return None, None, None
+
+        job = metastore.get_job(job_id)
+        if not job:
+            raise JobNotFoundError(f"Job with id {job_id} not found")
+
+        _hash = self._calculate_job_hash(job.id)
+
+        if (
+            job.parent_job_id
+            and not checkpoints_reset
+            and metastore.find_checkpoint(job.parent_job_id, _hash)
+        ):
+            # checkpoint found → reuse dataset
+            chain = read_dataset(
+                name, namespace=project.namespace.name, project=project.name, **kwargs
             )
+            return job, _hash, chain

-            if result_ds:
-                return self._evolve(
-                    query=result_ds._query.save(
-                        name=name,
-                        version=version,
-                        project=project,
-                        feature_schema=schema,
-                        dependencies=dependencies,
-                        **kwargs,
-                    )
-                )
+        return job, _hash, None

-            if not has_changes:
-                # sources have not been changed so new version of resulting dataset
-                # would be the same as previous one. To avoid duplicating exact
-                # datasets, we won't create new version of it and we will return
-                # current latest version instead.
-                from .datasets import read_dataset
+    def _handle_delta(
+        self,
+        name: str,
+        version: Optional[str],
+        project: Project,
+        schema: dict,
+        kwargs: dict,
+    ) -> Optional["DataChain"]:
+        """Try to save as a delta dataset.
+        Returns:
+            A DataChain if delta logic could handle it, otherwise None to fall back
+            to the regular save path (e.g., on first dataset creation).
+        """
+        from datachain.delta import delta_retry_update

-                return read_dataset(
-                    name, namespace=namespace_name, project=project_name, **kwargs
-                )
+        from .datasets import read_dataset

-        result = self._evolve(
-            query=self._query.save(
-                name=name,
-                version=version,
-                project=project,
-                description=description,
-                attrs=attrs,
-                feature_schema=schema,
-                update_version=update_version,
-                **kwargs,
-            )
+        if not self.delta or not name:
+            return None
+
+        assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+        result_ds, dependencies, has_changes = delta_retry_update(
+            self,
+            project.namespace.name,
+            project.name,
+            name,
+            on=self._delta_on,
+            right_on=self._delta_result_on,
+            compare=self._delta_compare,
+            delta_retry=self._delta_retry,
        )

-        if job_id := os.getenv("DATACHAIN_JOB_ID"):
-            catalog.metastore.create_checkpoint(
-                job_id,  # type: ignore[arg-type]
-                _hash=hashlib.sha256(  # TODO this will be replaced with self.hash()
-                    str(uuid4()).encode()
-                ).hexdigest(),
+        # Case 1: delta produced a new dataset
+        if result_ds:
+            return self._evolve(
+                query=result_ds._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    feature_schema=schema,
+                    dependencies=dependencies,
+                    **kwargs,
+                )
            )

-        return result
+        # Case 2: no changes → reuse last version
+        if not has_changes:
+            # sources have not been changed so new version of resulting dataset
+            # would be the same as previous one. To avoid duplicating exact
+            # datasets, we won't create new version of it and we will return
+            # current latest version instead.
+            return read_dataset(
+                name,
+                namespace=project.namespace.name,
+                project=project.name,
+                **kwargs,
+            )
+
+        # Case 3: first creation of dataset
+        return None

     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
@@ -1,4 +1,6 @@
 import copy
+import hashlib
+import json
 import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
@@ -257,6 +259,11 @@ class SignalSchema:
         signals["_custom_types"] = custom_types
         return signals

+    def hash(self) -> str:
+        """Create SHA hash of this schema"""
+        json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
+        return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
     @staticmethod
     def _split_subtypes(type_name: str) -> list[str]:
         """This splits a list of subtypes, including proper square bracket handling."""
datachain/lib/udf.py CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import sys
 import traceback
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
@@ -12,6 +13,7 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -61,6 +63,9 @@ class UDFAdapter:
     batch_size: Optional[int] = None
     batch: int = 1

+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()
@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
         self.output = None
         self._func = None

+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+        """
+        parts = [
+            hash_callable(self._func),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
@@ -1,4 +1,5 @@
 import contextlib
+import hashlib
 import inspect
 import logging
 import os
@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
+from datachain.hash_utils import hash_column_elements
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache
@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
 from datachain.utils import (
     determine_processes,
     determine_workers,
+    ensure_sequence,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
     safe_closing,
@@ -167,6 +170,18 @@ class Step(ABC):
     ) -> "StepResult":
         """Apply the processing step."""

+    @abstractmethod
+    def hash_inputs(self) -> str:
+        """Calculates hash of step inputs"""
+
+    def hash(self) -> str:
+        """
+        Calculates hash for step which includes step name and hash of it's inputs
+        """
+        return hashlib.sha256(
+            f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+        ).hexdigest()
+

 @frozen
 class QueryStep:
@@ -186,6 +201,11 @@ class QueryStep:
             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )

+    def hash(self) -> str:
+        return hashlib.sha256(
+            self.dataset.uri(self.dataset_version).encode()
+        ).hexdigest()
+

 def generator_then_call(generator, func: Callable):
     """
@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
 class Subtract(DatasetDiffOperation):
     on: Sequence[tuple[str, str]]

+    def hash_inputs(self) -> str:
+        on_bytes = b"".join(
+            f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+        )
+
+        return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")
@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
     min_task_size: Optional[int] = None
     batch_size: Optional[int] = None

+    def hash_inputs(self) -> str:
+        partition_by = ensure_sequence(self.partition_by or [])
+        parts = [
+            bytes.fromhex(self.udf.hash()),
+            bytes.fromhex(hash_column_elements(partition_by)),
+            str(self.is_generator).encode(),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
         """Method that creates a table where temp udf results will be saved"""
@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
 class SQLSelect(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query) -> Select:
         subquery = query.subquery()
         args = [
@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
 class SQLSelectExcept(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         subquery = query.subquery()
         args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]
@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
     args: tuple[Label, ...]
     new_schema: SignalSchema

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
         to_mutate = {c.name for c in self.args}
@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
 class SQLFilter(SQLClause):
     expressions: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.expressions)
+
     def __and__(self, other):
         expressions = self.parse_cols(self.expressions)
         return self.__class__(expressions + other)
@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
 class SQLOrderBy(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         args = self.parse_cols(self.args)
         return query.order_by(*args)
@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
 class SQLLimit(SQLClause):
     n: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.n).encode()).hexdigest()
+
     def apply_sql_clause(self, query: Select) -> Select:
         return query.limit(self.n)

@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
 class SQLOffset(SQLClause):
     offset: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
     def apply_sql_clause(self, query: "GenerativeSelect"):
         return query.offset(self.offset)


 @frozen
 class SQLCount(SQLClause):
+    def hash_inputs(self) -> str:
+        return ""
+
     def apply_sql_clause(self, query):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())

@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
     args: tuple[ColumnElement, ...]
     dialect: str

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query):
         if self.dialect == "sqlite":
             return query.group_by(*self.args)
@@ -903,6 +967,11 @@ class SQLUnion(Step):
     query1: "DatasetQuery"
     query2: "DatasetQuery"

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+        ).hexdigest()
+
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:
@@ -939,6 +1008,20 @@ class SQLJoin(Step):
     full: bool
     rname: str

+    def hash_inputs(self) -> str:
+        predicates = ensure_sequence(self.predicates or [])
+
+        parts = [
+            bytes.fromhex(self.query1.hash()),
+            bytes.fromhex(self.query2.hash()),
+            bytes.fromhex(hash_column_elements(predicates)),
+            str(self.inner).encode(),
+            str(self.full).encode(),
+            self.rname.encode("utf-8"),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
         query = dq.apply_steps().select()
         temp_tables.extend(dq.temp_table_names)
@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
     cols: Sequence[Union[str, Function, ColumnElement]]
     group_by: Sequence[Union[str, Function, ColumnElement]]

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(
+                hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+            )
+        ).hexdigest()
+
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")
@@ -1213,6 +1303,23 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)

+    def hash(self) -> str:
+        """
+        Calculates hash of this class taking into account hash of starting step
+        and hashes of each following steps. Ordering is important.
+        """
+        hasher = hashlib.sha256()
+        if self.starting_step:
+            hasher.update(self.starting_step.hash().encode("utf-8"))
+        else:
+            assert self.list_ds_name
+            hasher.update(self.list_ds_name.encode("utf-8"))
+
+        for step in self.steps:
+            hasher.update(step.hash().encode("utf-8"))
+
+        return hasher.hexdigest()
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
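Taken together, the scheme above hashes each step as its class name plus a hash of its inputs, and DatasetQuery.hash() folds the step hashes in order on top of the starting step. A generic sketch of that folding (step names and inputs here are made up for illustration):

```python
import hashlib


def step_hash(step_name: str, inputs_digest: str) -> str:
    # mirrors Step.hash(): class name and input digest joined with "|"
    return hashlib.sha256(f"{step_name}|{inputs_digest}".encode()).hexdigest()


hasher = hashlib.sha256()
hasher.update(hashlib.sha256(b"ds://animals@v3").hexdigest().encode("utf-8"))  # starting step
for name, inputs in [("SQLFilter", "size > 5"), ("SQLLimit", "10")]:
    hasher.update(step_hash(name, hashlib.sha256(inputs.encode()).hexdigest()).encode("utf-8"))
query_digest = hasher.hexdigest()  # reordering the steps changes the digest
```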
datachain/utils.py CHANGED
@@ -537,3 +537,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
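Usage sketch for the new ensure_sequence helper: strings and bytes are wrapped in a list rather than treated as sequences of characters.

```python
from datachain.utils import ensure_sequence

assert ensure_sequence([1, 2]) == [1, 2]              # real sequences pass through
assert ensure_sequence(("a", "b")) == ("a", "b")
assert ensure_sequence("file.path") == ["file.path"]  # strings get wrapped
```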
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.33.0
+Version: 0.34.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -103,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.18.1; extra == "dev"
+Requires-Dist: mypy==1.18.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
@@ -6,8 +6,9 @@ datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=WR1MoO9BPI0hO1FVKVTS0hgyxxumywtDnSY7Sv1oE1c,1796
-datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
+datachain/error.py,sha256=P_5KXlfVIsW4E42JJCoFhGsgvY8la-6jXBEWbHbgqKo,1846
+datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
+datachain/job.py,sha256=WDkZrr4Je50nngRDaRapNpGpx_50L6wYWmAqcMT_yCw,1367
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
@@ -20,9 +21,9 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
+datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
+datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -50,13 +51,13 @@ datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,460
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
-datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
-datachain/data_storage/metastore.py,sha256=TgLYAKraH1WsmteaAqO5TW2VzNZZM4_SASgcBlDzdr8,60218
+datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
+datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
 datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
-datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
+datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825
@@ -88,10 +89,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
-datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
+datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=08ia5T3gClen5ZQfIgop-swNnys2G-RIZpszqDnbc0w,17570
+datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -106,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=1LvKFKqAWw8TMw2bdpfG6LfOCMMgBS6bluBp0lCX0s4,100845
+datachain/lib/dc/datachain.py,sha256=uUAPchtNXyJo1tzFd3z1MLWhVC2dzO2ZjhTS0naqXiE,104032
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -129,7 +130,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
-datachain/query/dataset.py,sha256=1eg5EE4vKI7c_Ng04or6zzKmFcOoEubMCoOaYmYPavE,64499
+datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
 datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -163,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.33.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.33.0.dist-info/METADATA,sha256=UGH-boSaU6Kaz6RIsQItwQe4Auzl6L4oHSeeNCKZ7pw,13655
-datachain-0.33.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.33.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.33.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.33.0.dist-info/RECORD,,
+datachain-0.34.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.34.0.dist-info/METADATA,sha256=YBmM_daqadosEKHBY-QLxSRxYn55XuhB0S0tfeEfzts,13655
+datachain-0.34.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.34.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.34.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.34.0.dist-info/RECORD,,