datachain 0.33.0__py3-none-any.whl → 0.33.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.

This version of datachain has been flagged as potentially problematic.

datachain/catalog/catalog.py CHANGED
@@ -144,19 +144,26 @@ def shutdown_process(
     return proc.wait()
 
 
-def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt
 
-        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-            line = buffer.decode("utf-8")
-            callback(line)
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt
 
-    if buffer:  # Handle any remaining data in the buffer
-        line = buffer.decode("utf-8")
-        callback(line)
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass
 
 
 class DatasetRowsFetcher(NodesThreadPool):
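The renamed `process_output` still reads one byte at a time so that both `\n`- and `\r`-terminated output (progress bars, for example) is forwarded promptly, but it now decodes with `errors="replace"` and guarantees the stream is closed even if the callback raises. A minimal sketch of how it can be driven, assuming it remains a module-level helper in `datachain/catalog/catalog.py` as this diff shows (the subcommand and print callback are illustrative):

```python
import subprocess
import sys
from threading import Thread

from datachain.catalog.catalog import process_output  # module path assumed from this diff

proc = subprocess.Popen(  # noqa: S603
    [sys.executable, "-c", "print('hello'); print('world')"],
    stdout=subprocess.PIPE,
)
# Each callback invocation receives one decoded line, terminator included.
reader = Thread(target=process_output, args=(proc.stdout, lambda s: print(s.rstrip())), daemon=True)
reader.start()
proc.wait()
reader.join(timeout=30)  # bounded join, mirroring the new Catalog.query behavior below
```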
@@ -1760,13 +1767,13 @@ class Catalog:
             recursive=recursive,
         )
 
+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-        capture_output: bool = False,
-        output_hook: Callable[[str], None] = noop,
+        stdout_callback: Optional[Callable[[str], None]] = None,
+        stderr_callback: Optional[Callable[[str], None]] = None,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,
@@ -1781,13 +1788,18 @@ class Catalog:
             },
         )
         popen_kwargs: dict[str, Any] = {}
-        if capture_output:
-            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE
 
         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
             raise TerminationSignal(sig)
 
-        thread: Optional[Thread] = None
+        stdout_thread: Optional[Thread] = None
+        stderr_thread: Optional[Thread] = None
+
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             logger.info("Starting process %s", proc.pid)
 
@@ -1801,10 +1813,20 @@ class Catalog:
            orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
            signal.signal(signal.SIGTERM, raise_termination_signal)
            try:
-               if capture_output:
-                   args = (proc.stdout, output_hook)
-                   thread = Thread(target=_process_stream, args=args, daemon=True)
-                   thread.start()
+               if stdout_callback is not None:
+                   stdout_thread = Thread(
+                       target=process_output,
+                       args=(proc.stdout, stdout_callback),
+                       daemon=True,
+                   )
+                   stdout_thread.start()
+               if stderr_callback is not None:
+                   stderr_thread = Thread(
+                       target=process_output,
+                       args=(proc.stderr, stderr_callback),
+                       daemon=True,
+                   )
+                   stderr_thread.start()
 
                proc.wait()
            except TerminationSignal as exc:
@@ -1822,8 +1844,22 @@ class Catalog:
            finally:
                signal.signal(signal.SIGTERM, orig_sigterm_handler)
                signal.signal(signal.SIGINT, orig_sigint_handler)
-               if thread:
-                   thread.join()  # wait for the reader thread
+               # wait for the reader thread
+               thread_join_timeout_seconds = 30
+               if stdout_thread is not None:
+                   stdout_thread.join(timeout=thread_join_timeout_seconds)
+                   if stdout_thread.is_alive():
+                       logger.warning(
+                           "stdout thread is still alive after %s seconds",
+                           thread_join_timeout_seconds,
+                       )
+               if stderr_thread is not None:
+                   stderr_thread.join(timeout=thread_join_timeout_seconds)
+                   if stderr_thread.is_alive():
+                       logger.warning(
+                           "stderr thread is still alive after %s seconds",
+                           thread_join_timeout_seconds,
+                       )
 
            logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
            if proc.returncode in (
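Taken together, these catalog.py hunks replace the boolean `capture_output`/`output_hook` pair with independent per-stream callbacks: stdout and stderr each get their own pipe and reader thread, stderr is no longer folded into stdout via `subprocess.STDOUT`, and the final join is bounded at 30 seconds instead of potentially indefinite. A sketch of the new call shape, with a placeholder script (`query` is now a `@staticmethod`, so the class-level call below is legal; the import path is assumed):

```python
from datachain.catalog import Catalog  # import path assumed

logs: list[str] = []
errors: list[str] = []

# Separate callbacks receive decoded lines from each stream as they arrive.
Catalog.query(
    "print('hello from the query script')",  # placeholder query_script
    stdout_callback=logs.append,
    stderr_callback=errors.append,
)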
datachain/data_storage/job.py CHANGED
@@ -4,6 +4,7 @@ from enum import Enum
 class JobStatus(int, Enum):
     CREATED = 1
     SCHEDULED = 10
+    PROVISIONING = 12
     QUEUED = 2
     INIT = 3
     RUNNING = 4
datachain/diff/__init__.py CHANGED
@@ -1,5 +1,3 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
 
-
 C = Column
 
 
-def get_status_col_name() -> str:
-    """Returns new unique status col name"""
-    return "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
 
 
 class CompareStatus(str, Enum):
@@ -101,9 +95,9 @@ def _compare(  # noqa: C901, PLR0912
     compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]
 
     # get diff column names
-    diff_col = status_col or get_status_col_name()
-    ldiff_col = get_status_col_name()
-    rdiff_col = get_status_col_name()
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME
 
     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
@@ -227,7 +221,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col = get_status_col_name()
+    status_col = STATUS_COL_NAME
 
     res = _compare(
         left,
datachain/hash_utils.py ADDED
@@ -0,0 +1,147 @@
+import hashlib
+import inspect
+import json
+import textwrap
+from collections.abc import Sequence
+from typing import TypeVar, Union
+
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BindParameter,
+    ColumnElement,
+    Label,
+    Over,
+    UnaryExpression,
+)
+from sqlalchemy.sql.functions import Function
+
+T = TypeVar("T", bound=ColumnElement)
+ColumnLike = Union[str, T]
+
+
+def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
+    """
+    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
+    """
+
+    # Binary operations: col > 5, col1 + col2, etc.
+    if isinstance(expr, BinaryExpression):
+        op = (
+            expr.operator.__name__
+            if hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+        return {
+            "type": "binary",
+            "op": op,
+            "left": serialize_column_element(expr.left),
+            "right": serialize_column_element(expr.right),
+        }
+
+    # Unary operations: -col, NOT col, etc.
+    if isinstance(expr, UnaryExpression):
+        op = (
+            expr.operator.__name__
+            if expr.operator is not None and hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+
+        return {
+            "type": "unary",
+            "op": op,
+            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
+        }
+
+    # Function calls: func.lower(col), func.count(col), etc.
+    if isinstance(expr, Function):
+        return {
+            "type": "function",
+            "name": expr.name,
+            "clauses": [serialize_column_element(c) for c in expr.clauses],
+        }
+
+    # Window functions: func.row_number().over(partition_by=..., order_by=...)
+    if isinstance(expr, Over):
+        return {
+            "type": "window",
+            "function": serialize_column_element(expr.element),
+            "partition_by": [
+                serialize_column_element(p) for p in getattr(expr, "partition_by", [])
+            ],
+            "order_by": [
+                serialize_column_element(o) for o in getattr(expr, "order_by", [])
+            ],
+        }
+
+    # Labeled expressions: col.label("alias")
+    if isinstance(expr, Label):
+        return {
+            "type": "label",
+            "name": expr.name,
+            "element": serialize_column_element(expr.element),
+        }
+
+    # Bound values (constants)
+    if isinstance(expr, BindParameter):
+        return {"type": "bind", "value": expr.value}
+
+    # Plain columns
+    if hasattr(expr, "name"):
+        return {"type": "column", "name": expr.name}
+
+    # Fallback: stringify unknown nodes
+    return {"type": "other", "repr": str(expr)}
+
+
+def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
+    """
+    Hash a list of ColumnElements deterministically, dialect agnostic.
+    Only accepts ordered iterables (like list or tuple).
+    """
+    serialized = [serialize_column_element(c) for c in columns]
+    json_str = json.dumps(serialized, sort_keys=True)  # stable JSON
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def hash_callable(func):
+    """
+    Calculate a hash from a callable.
+    Rules:
+    - Named functions (def) → use source code for stable, cross-version hashing
+    - Lambdas → use bytecode (deterministic in same Python runtime)
+    """
+    if not callable(func):
+        raise TypeError("Expected a callable")
+
+    # Determine if it is a lambda
+    is_lambda = func.__name__ == "<lambda>"
+
+    if not is_lambda:
+        # Try to get exact source of named function
+        try:
+            lines, _ = inspect.getsourcelines(func)
+            payload = textwrap.dedent("".join(lines)).strip()
+        except (OSError, TypeError):
+            # Fallback: bytecode if source not available
+            payload = func.__code__.co_code
+    else:
+        # For lambdas, fall back directly to bytecode
+        payload = func.__code__.co_code
+
+    # Normalize annotations
+    annotations = {
+        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
+    }
+
+    # Extras to distinguish functions with same code but different metadata
+    extras = {
+        "name": func.__name__,
+        "defaults": func.__defaults__,
+        "annotations": annotations,
+    }
+
+    # Compute SHA256
+    h = hashlib.sha256()
+    h.update(str(payload).encode() if isinstance(payload, str) else payload)
+    h.update(str(extras).encode())
+    return h.hexdigest()
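This new module backs the hashing machinery introduced throughout this release: column expressions are serialized into plain dicts and hashed as sorted JSON, so the digest is independent of SQL dialect and of `str(expr)` quirks, while callables are hashed from source when available. A rough illustration of the shapes involved (output abbreviated):

```python
import sqlalchemy as sa

from datachain.hash_utils import hash_callable, hash_column_elements, serialize_column_element

size = sa.column("size")

serialize_column_element(size > 5)
# -> {"type": "binary", "op": "gt",
#     "left": {"type": "column", "name": "size"},
#     "right": {"type": "bind", "value": 5}}

# Structurally equal expressions hash identically, regardless of dialect:
assert hash_column_elements([size > 5]) == hash_column_elements([sa.column("size") > 5])

def double(x: int) -> int:
    return x * 2

hash_callable(double)           # source-based: stable across processes and runs
hash_callable(lambda x: x * 2)  # bytecode-based: stable only within one runtime
```

Note the asymmetry spelled out in the docstring: named functions hash by source, so even formatting changes alter the digest, while lambdas hash by bytecode and are not portable across Python versions.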
datachain/lib/dc/datachain.py CHANGED
@@ -209,6 +209,14 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()
 
+    def hash(self) -> str:
+        """
+        Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+        It takes into account all the steps added to the chain and their inputs.
+        Order of the steps is important.
+        """
+        return self._query.hash()
+
     def _as_delta(
         self,
         on: Optional[Union[str, Sequence[str]]] = None,
@@ -682,7 +690,7 @@ class DataChain:
 
         if job_id := os.getenv("DATACHAIN_JOB_ID"):
            catalog.metastore.create_checkpoint(
-                job_id,  # type: ignore[arg-type]
+                job_id,
                _hash=hashlib.sha256(  # TODO this will be replaced with self.hash()
                    str(uuid4()).encode()
                ).hexdigest(),
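With `DatasetQuery.hash()` in place (see the query/dataset.py hunks below), `DataChain.hash()` becomes a cheap fingerprint of the whole chain, computed without executing it; the second hunk shows checkpoint hashing still using a random UUID for now, with a TODO to switch to `self.hash()`. A sketch with placeholder dataset and column names:

```python
import datachain as dc

chain = dc.read_dataset("my-dataset").filter(dc.C("size") > 100)

# Rebuilding the identical chain yields the identical digest...
assert chain.hash() == dc.read_dataset("my-dataset").filter(dc.C("size") > 100).hash()

# ...while adding, removing, or reordering steps changes it.
assert chain.hash() != dc.read_dataset("my-dataset").hash()
```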
datachain/lib/signal_schema.py CHANGED
@@ -1,4 +1,6 @@
 import copy
+import hashlib
+import json
 import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
@@ -257,6 +259,11 @@ class SignalSchema:
         signals["_custom_types"] = custom_types
         return signals
 
+    def hash(self) -> str:
+        """Create SHA hash of this schema"""
+        json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
+        return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
     @staticmethod
     def _split_subtypes(type_name: str) -> list[str]:
         """This splits a list of subtypes, including proper square bracket handling."""
datachain/lib/udf.py CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import sys
 import traceback
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
@@ -12,6 +13,7 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
@@ -61,6 +63,9 @@ class UDFAdapter:
     batch_size: Optional[int] = None
     batch: int = 1
 
+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()
@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
         self.output = None
         self._func = None
 
+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+        """
+        parts = [
+            hash_callable(self._func),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
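`UDFBase.hash()` composes three hex digests (the callable, the input schema, and the output schema) by converting each back to raw bytes and hashing the concatenation; the empty string used when there is no `params` works because `bytes.fromhex("")` is simply `b""`. A standalone sketch of that composition (the `b"func"`/`b"output"` payloads stand in for the real part hashes):

```python
import hashlib

parts = [
    hashlib.sha256(b"func").hexdigest(),    # stands in for hash_callable(self._func)
    "",                                     # missing params schema contributes zero bytes
    hashlib.sha256(b"output").hexdigest(),  # stands in for self.output.hash()
]
combined = hashlib.sha256(b"".join(bytes.fromhex(p) for p in parts)).hexdigest()
print(combined)
```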
datachain/query/dataset.py CHANGED
@@ -1,4 +1,5 @@
 import contextlib
+import hashlib
 import inspect
 import logging
 import os
@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
+from datachain.hash_utils import hash_column_elements
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache
@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
 from datachain.utils import (
     determine_processes,
     determine_workers,
+    ensure_sequence,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
     safe_closing,
@@ -167,6 +170,18 @@ class Step(ABC):
     ) -> "StepResult":
         """Apply the processing step."""
 
+    @abstractmethod
+    def hash_inputs(self) -> str:
+        """Calculates hash of step inputs"""
+
+    def hash(self) -> str:
+        """
+        Calculates hash for step which includes step name and hash of it's inputs
+        """
+        return hashlib.sha256(
+            f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+        ).hexdigest()
+
 
 @frozen
 class QueryStep:
@@ -186,6 +201,11 @@ class QueryStep:
             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )
 
+    def hash(self) -> str:
+        return hashlib.sha256(
+            self.dataset.uri(self.dataset_version).encode()
+        ).hexdigest()
+
 
 def generator_then_call(generator, func: Callable):
     """
@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
 class Subtract(DatasetDiffOperation):
     on: Sequence[tuple[str, str]]
 
+    def hash_inputs(self) -> str:
+        on_bytes = b"".join(
+            f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+        )
+
+        return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")
@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
     min_task_size: Optional[int] = None
     batch_size: Optional[int] = None
 
+    def hash_inputs(self) -> str:
+        partition_by = ensure_sequence(self.partition_by or [])
+        parts = [
+            bytes.fromhex(self.udf.hash()),
+            bytes.fromhex(hash_column_elements(partition_by)),
+            str(self.is_generator).encode(),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
         """Method that creates a table where temp udf results will be saved"""
@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
 class SQLSelect(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query) -> Select:
         subquery = query.subquery()
         args = [
@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
 class SQLSelectExcept(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         subquery = query.subquery()
         args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]
@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
     args: tuple[Label, ...]
     new_schema: SignalSchema
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
         to_mutate = {c.name for c in self.args}
@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
 class SQLFilter(SQLClause):
     expressions: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.expressions)
+
     def __and__(self, other):
         expressions = self.parse_cols(self.expressions)
         return self.__class__(expressions + other)
@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
 class SQLOrderBy(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         args = self.parse_cols(self.args)
         return query.order_by(*args)
@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
 class SQLLimit(SQLClause):
     n: int
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.n).encode()).hexdigest()
+
     def apply_sql_clause(self, query: Select) -> Select:
         return query.limit(self.n)
 
@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
 class SQLOffset(SQLClause):
     offset: int
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
     def apply_sql_clause(self, query: "GenerativeSelect"):
         return query.offset(self.offset)
 
 
 @frozen
 class SQLCount(SQLClause):
+    def hash_inputs(self) -> str:
+        return ""
+
     def apply_sql_clause(self, query):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())
 
@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
     args: tuple[ColumnElement, ...]
     dialect: str
 
+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query):
         if self.dialect == "sqlite":
             return query.group_by(*self.args)
@@ -903,6 +967,11 @@ class SQLUnion(Step):
     query1: "DatasetQuery"
     query2: "DatasetQuery"
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+        ).hexdigest()
+
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:
@@ -939,6 +1008,20 @@ class SQLJoin(Step):
     full: bool
     rname: str
 
+    def hash_inputs(self) -> str:
+        predicates = ensure_sequence(self.predicates or [])
+
+        parts = [
+            bytes.fromhex(self.query1.hash()),
+            bytes.fromhex(self.query2.hash()),
+            bytes.fromhex(hash_column_elements(predicates)),
+            str(self.inner).encode(),
+            str(self.full).encode(),
+            self.rname.encode("utf-8"),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
         query = dq.apply_steps().select()
         temp_tables.extend(dq.temp_table_names)
@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
     cols: Sequence[Union[str, Function, ColumnElement]]
     group_by: Sequence[Union[str, Function, ColumnElement]]
 
+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(
+                hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+            )
+        ).hexdigest()
+
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")
@@ -1213,6 +1303,23 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)
 
+    def hash(self) -> str:
+        """
+        Calculates hash of this class taking into account hash of starting step
+        and hashes of each following steps. Ordering is important.
+        """
+        hasher = hashlib.sha256()
+        if self.starting_step:
+            hasher.update(self.starting_step.hash().encode("utf-8"))
+        else:
+            assert self.list_ds_name
+            hasher.update(self.list_ds_name.encode("utf-8"))
+
+        for step in self.steps:
+            hasher.update(step.hash().encode("utf-8"))
+
+        return hasher.hexdigest()
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
datachain/utils.py CHANGED
@@ -537,3 +537,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
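`ensure_sequence` normalizes "one or many" arguments (used by the `hash_inputs` implementations above for `partition_by` and join predicates) while deliberately treating `str` and `bytes` as scalars, since both are technically `Sequence`s:

```python
from datachain.utils import ensure_sequence

assert ensure_sequence(["a", "b"]) == ["a", "b"]  # real sequences pass through
assert ensure_sequence("ab") == ["ab"]            # a str is wrapped, not iterated
assert ensure_sequence(5) == [5]                  # scalars are wrapped
```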
datachain-0.33.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.33.0
+Version: 0.33.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -103,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.18.1; extra == "dev"
+Requires-Dist: mypy==1.18.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
datachain-0.33.1.dist-info/RECORD CHANGED
@@ -7,6 +7,7 @@ datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
 datachain/error.py,sha256=WR1MoO9BPI0hO1FVKVTS0hgyxxumywtDnSY7Sv1oE1c,1796
+datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
@@ -20,9 +21,9 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=5ehFeqXau7MFmGUQRsjRyPfDMPoOF1ojpfVciYUo5fE,15659
+datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
+datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -50,13 +51,13 @@ datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,460
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
-datachain/data_storage/job.py,sha256=ZkeXCNUj_VCkoKYx29hqB4AcfVUielnRjY-GYUcUxt4,426
+datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
 datachain/data_storage/metastore.py,sha256=TgLYAKraH1WsmteaAqO5TW2VzNZZM4_SASgcBlDzdr8,60218
 datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
-datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
+datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825
@@ -88,10 +89,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
-datachain/lib/signal_schema.py,sha256=YMMcc9gHIzBz88zfsreGa1nOoO_56HBtZlT6jf3V1WE,39224
+datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=08ia5T3gClen5ZQfIgop-swNnys2G-RIZpszqDnbc0w,17570
+datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851
@@ -106,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=1LvKFKqAWw8TMw2bdpfG6LfOCMMgBS6bluBp0lCX0s4,100845
+datachain/lib/dc/datachain.py,sha256=FBz-IzbLeh8cS8yI2WiGBkLjV4fN7YqqqnCuuuj0S-o,101111
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -129,7 +130,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
-datachain/query/dataset.py,sha256=1eg5EE4vKI7c_Ng04or6zzKmFcOoEubMCoOaYmYPavE,64499
+datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
 datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -163,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.33.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.33.0.dist-info/METADATA,sha256=UGH-boSaU6Kaz6RIsQItwQe4Auzl6L4oHSeeNCKZ7pw,13655
-datachain-0.33.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.33.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.33.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.33.0.dist-info/RECORD,,
+datachain-0.33.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.33.1.dist-info/METADATA,sha256=1D-XqF5TtHydJqpLRIRpld9UKQftLhw_RkDUjI_NE2c,13655
+datachain-0.33.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.33.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.33.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.33.1.dist-info/RECORD,,