datachain 0.33.0__py3-none-any.whl → 0.34.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/catalog/catalog.py +58 -22
- datachain/data_storage/job.py +1 -0
- datachain/data_storage/metastore.py +22 -1
- datachain/diff/__init__.py +7 -13
- datachain/error.py +4 -0
- datachain/hash_utils.py +147 -0
- datachain/job.py +3 -0
- datachain/lib/dc/datachain.py +166 -70
- datachain/lib/signal_schema.py +7 -0
- datachain/lib/udf.py +20 -0
- datachain/query/dataset.py +107 -0
- datachain/utils.py +6 -0
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/METADATA +2 -2
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/RECORD +18 -17
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/WHEEL +0 -0
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -144,19 +144,26 @@ def shutdown_process(
     return proc.wait()


-def
+def process_output(stream: IO[bytes], callback: Callable[[str], None]) -> None:
     buffer = b""
-    while byt := stream.read(1):  # Read one byte at a time
-        buffer += byt

-
-
-
-            buffer = b""  # Clear buffer for next line
+    try:
+        while byt := stream.read(1):  # Read one byte at a time
+            buffer += byt

-
-
-
+            if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+                line = buffer.decode("utf-8", errors="replace")
+                callback(line)
+                buffer = b""  # Clear buffer for the next line
+
+        if buffer:  # Handle any remaining data in the buffer
+            line = buffer.decode("utf-8", errors="replace")
+            callback(line)
+    finally:
+        try:
+            stream.close()  # Ensure output is closed
+        except Exception:  # noqa: BLE001, S110
+            pass


 class DatasetRowsFetcher(NodesThreadPool):

@@ -1760,13 +1767,13 @@ class Catalog:
             recursive=recursive,
         )

+    @staticmethod
     def query(
-        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-
-
+        stdout_callback: Optional[Callable[[str], None]] = None,
+        stderr_callback: Optional[Callable[[str], None]] = None,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,

@@ -1781,13 +1788,18 @@ class Catalog:
             },
         )
         popen_kwargs: dict[str, Any] = {}
-
-
+
+        if stdout_callback is not None:
+            popen_kwargs = {"stdout": subprocess.PIPE}
+        if stderr_callback is not None:
+            popen_kwargs["stderr"] = subprocess.PIPE

         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
             raise TerminationSignal(sig)

-
+        stdout_thread: Optional[Thread] = None
+        stderr_thread: Optional[Thread] = None
+
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             logger.info("Starting process %s", proc.pid)

@@ -1801,10 +1813,20 @@ class Catalog:
             orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
             signal.signal(signal.SIGTERM, raise_termination_signal)
             try:
-                if
-
-
-
+                if stdout_callback is not None:
+                    stdout_thread = Thread(
+                        target=process_output,
+                        args=(proc.stdout, stdout_callback),
+                        daemon=True,
+                    )
+                    stdout_thread.start()
+                if stderr_callback is not None:
+                    stderr_thread = Thread(
+                        target=process_output,
+                        args=(proc.stderr, stderr_callback),
+                        daemon=True,
+                    )
+                    stderr_thread.start()

                 proc.wait()
             except TerminationSignal as exc:

@@ -1822,8 +1844,22 @@ class Catalog:
             finally:
                 signal.signal(signal.SIGTERM, orig_sigterm_handler)
                 signal.signal(signal.SIGINT, orig_sigint_handler)
-
-
+                # wait for the reader thread
+                thread_join_timeout_seconds = 30
+                if stdout_thread is not None:
+                    stdout_thread.join(timeout=thread_join_timeout_seconds)
+                    if stdout_thread.is_alive():
+                        logger.warning(
+                            "stdout thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )
+                if stderr_thread is not None:
+                    stderr_thread.join(timeout=thread_join_timeout_seconds)
+                    if stderr_thread.is_alive():
+                        logger.warning(
+                            "stderr thread is still alive after %s seconds",
+                            thread_join_timeout_seconds,
+                        )

         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode in (
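The reworked `Catalog.query` replaces the old output handling with per-stream callbacks fed line by line from daemon reader threads. A minimal sketch of how a caller might use them, assuming `catalog` is a datachain `Catalog` instance; the script text and callback names are illustrative only:

```python
def run_query_with_streaming(catalog, script: str) -> None:
    """Sketch only: stream a query script's output via the new callbacks."""

    def on_stdout(line: str) -> None:
        print(f"[stdout] {line.rstrip()}")

    def on_stderr(line: str) -> None:
        print(f"[stderr] {line.rstrip()}")

    # Passing a callback makes Catalog.query pipe that stream and read it on a
    # daemon thread via process_output(); omitting both keeps the old behavior.
    catalog.query(
        script,
        stdout_callback=on_stdout,
        stderr_callback=on_stderr,
    )
```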
datachain/data_storage/job.py
CHANGED
datachain/data_storage/metastore.py
CHANGED

@@ -21,6 +21,7 @@ from sqlalchemy import (
     Table,
     Text,
     UniqueConstraint,
+    desc,
     select,
 )
 from sqlalchemy.sql import func as f

@@ -399,6 +400,7 @@ class AbstractMetastore(ABC, Serializable):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
     ) -> str:
         """
         Creates a new job.

@@ -443,6 +445,10 @@ class AbstractMetastore(ABC, Serializable):
     def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
         """Returns all checkpoints related to some job"""

+    @abstractmethod
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        """Get last created checkpoint for some job."""
+
     @abstractmethod
     def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
         """Gets single checkpoint by id"""

@@ -1548,6 +1554,7 @@ class AbstractDBMetastore(AbstractMetastore):
             Column("error_stack", Text, nullable=False, default=""),
             Column("params", JSON, nullable=False),
             Column("metrics", JSON, nullable=False),
+            Column("parent_job_id", Text, nullable=True),
         ]

     @cached_property

@@ -1595,6 +1602,7 @@ class AbstractDBMetastore(AbstractMetastore):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
         conn: Optional[Any] = None,
     ) -> str:
         """

@@ -1616,6 +1624,7 @@ class AbstractDBMetastore(AbstractMetastore):
                 error_stack="",
                 params=json.dumps(params or {}),
                 metrics=json.dumps({}),
+                parent_job_id=parent_job_id,
             ),
             conn=conn,
         )

@@ -1770,7 +1779,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return self.get_checkpoint_by_id(checkpoint_id)

-    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[Checkpoint]:
         """List checkpoints by job id."""
         query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
         rows = list(self.db.execute(query, conn=conn))

@@ -1800,3 +1809,15 @@ class AbstractDBMetastore(AbstractMetastore):
         if not rows:
             return None
         return self.checkpoint_class.parse(*rows[0])
+
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        query = (
+            self._checkpoints_query()
+            .where(self._checkpoints.c.job_id == job_id)
+            .order_by(desc(self._checkpoints.c.created_at))
+            .limit(1)
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
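Together with the new `parent_job_id` column, `get_last_checkpoint` and the existing `find_checkpoint` let a re-run decide whether a previous run already produced a given result. A hedged sketch of that decision, assuming `metastore` is an `AbstractMetastore` implementation and the ids are illustrative:

```python
def can_reuse_result(metastore, job_id: str, chain_hash: str) -> bool:
    """Sketch: True if the parent job already checkpointed `chain_hash`."""
    job = metastore.get_job(job_id)
    if not job or not job.parent_job_id:
        return False
    # A matching checkpoint on the parent job means this step was already
    # computed by the previous run and its dataset can be reused.
    return metastore.find_checkpoint(job.parent_job_id, chain_hash) is not None
```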
datachain/diff/__init__.py
CHANGED
@@ -1,5 +1,3 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union

@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain

-
 C = Column


-
-
-
-    random.choice(string.ascii_letters)  # noqa: S311
-    for _ in range(10)
-)
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"


 class CompareStatus(str, Enum):

@@ -101,9 +95,9 @@ def _compare(  # noqa: C901, PLR0912
     compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]

     # get diff column names
-    diff_col = status_col or
-    ldiff_col =
-    rdiff_col =
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME

     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})

@@ -227,7 +221,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col =
+    status_col = STATUS_COL_NAME

     res = _compare(
         left,
datachain/error.py
CHANGED
datachain/hash_utils.py
ADDED
@@ -0,0 +1,147 @@
+import hashlib
+import inspect
+import json
+import textwrap
+from collections.abc import Sequence
+from typing import TypeVar, Union
+
+from sqlalchemy.sql.elements import (
+    BinaryExpression,
+    BindParameter,
+    ColumnElement,
+    Label,
+    Over,
+    UnaryExpression,
+)
+from sqlalchemy.sql.functions import Function
+
+T = TypeVar("T", bound=ColumnElement)
+ColumnLike = Union[str, T]
+
+
+def serialize_column_element(expr: Union[str, ColumnElement]) -> dict:  # noqa: PLR0911
+    """
+    Recursively serialize a SQLAlchemy ColumnElement into a deterministic structure.
+    """
+
+    # Binary operations: col > 5, col1 + col2, etc.
+    if isinstance(expr, BinaryExpression):
+        op = (
+            expr.operator.__name__
+            if hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+        return {
+            "type": "binary",
+            "op": op,
+            "left": serialize_column_element(expr.left),
+            "right": serialize_column_element(expr.right),
+        }
+
+    # Unary operations: -col, NOT col, etc.
+    if isinstance(expr, UnaryExpression):
+        op = (
+            expr.operator.__name__
+            if expr.operator is not None and hasattr(expr.operator, "__name__")
+            else str(expr.operator)
+        )
+
+        return {
+            "type": "unary",
+            "op": op,
+            "element": serialize_column_element(expr.element),  # type: ignore[arg-type]
+        }
+
+    # Function calls: func.lower(col), func.count(col), etc.
+    if isinstance(expr, Function):
+        return {
+            "type": "function",
+            "name": expr.name,
+            "clauses": [serialize_column_element(c) for c in expr.clauses],
+        }
+
+    # Window functions: func.row_number().over(partition_by=..., order_by=...)
+    if isinstance(expr, Over):
+        return {
+            "type": "window",
+            "function": serialize_column_element(expr.element),
+            "partition_by": [
+                serialize_column_element(p) for p in getattr(expr, "partition_by", [])
+            ],
+            "order_by": [
+                serialize_column_element(o) for o in getattr(expr, "order_by", [])
+            ],
+        }
+
+    # Labeled expressions: col.label("alias")
+    if isinstance(expr, Label):
+        return {
+            "type": "label",
+            "name": expr.name,
+            "element": serialize_column_element(expr.element),
+        }
+
+    # Bound values (constants)
+    if isinstance(expr, BindParameter):
+        return {"type": "bind", "value": expr.value}
+
+    # Plain columns
+    if hasattr(expr, "name"):
+        return {"type": "column", "name": expr.name}
+
+    # Fallback: stringify unknown nodes
+    return {"type": "other", "repr": str(expr)}
+
+
+def hash_column_elements(columns: Sequence[ColumnLike]) -> str:
+    """
+    Hash a list of ColumnElements deterministically, dialect agnostic.
+    Only accepts ordered iterables (like list or tuple).
+    """
+    serialized = [serialize_column_element(c) for c in columns]
+    json_str = json.dumps(serialized, sort_keys=True)  # stable JSON
+    return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
+
+def hash_callable(func):
+    """
+    Calculate a hash from a callable.
+    Rules:
+    - Named functions (def) → use source code for stable, cross-version hashing
+    - Lambdas → use bytecode (deterministic in same Python runtime)
+    """
+    if not callable(func):
+        raise TypeError("Expected a callable")
+
+    # Determine if it is a lambda
+    is_lambda = func.__name__ == "<lambda>"
+
+    if not is_lambda:
+        # Try to get exact source of named function
+        try:
+            lines, _ = inspect.getsourcelines(func)
+            payload = textwrap.dedent("".join(lines)).strip()
+        except (OSError, TypeError):
+            # Fallback: bytecode if source not available
+            payload = func.__code__.co_code
+    else:
+        # For lambdas, fall back directly to bytecode
+        payload = func.__code__.co_code
+
+    # Normalize annotations
+    annotations = {
+        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
+    }
+
+    # Extras to distinguish functions with same code but different metadata
+    extras = {
+        "name": func.__name__,
+        "defaults": func.__defaults__,
+        "annotations": annotations,
+    }
+
+    # Compute SHA256
+    h = hashlib.sha256()
+    h.update(str(payload).encode() if isinstance(payload, str) else payload)
+    h.update(str(extras).encode())
+    return h.hexdigest()
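The new helpers are deterministic for structurally equal inputs, which is what the step and UDF hashing in this release builds on. A small sketch; the column and function names here are examples only:

```python
import sqlalchemy as sa

from datachain.hash_utils import hash_callable, hash_column_elements

size = sa.column("size")

# Structurally identical expressions hash the same, independent of object
# identity or SQL dialect.
assert hash_column_elements([size > 5]) == hash_column_elements([sa.column("size") > 5])


def add_one(x: int) -> int:
    return x + 1


# Named functions are hashed from their source plus name/defaults/annotations;
# lambdas fall back to bytecode.
print(hash_callable(add_one))
print(hash_column_elements([sa.func.lower(size).label("size_lower")]))
```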
datachain/job.py
CHANGED
@@ -22,6 +22,7 @@ class Job:
     python_version: Optional[str] = None
     error_message: str = ""
     error_stack: str = ""
+    parent_job_id: Optional[str] = None

     @classmethod
     def parse(

@@ -39,6 +40,7 @@ class Job:
         error_stack: str,
         params: str,
         metrics: str,
+        parent_job_id: Optional[str],
     ) -> "Job":
         return cls(
             str(id),

@@ -54,4 +56,5 @@ class Job:
             python_version,
             error_message,
             error_stack,
+            parent_job_id,
         )
datachain/lib/dc/datachain.py
CHANGED
@@ -19,7 +19,6 @@ from typing import (
     cast,
     overload,
 )
-from uuid import uuid4

 import sqlalchemy
 import ujson as json

@@ -30,10 +29,15 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import
+from datachain.error import (
+    JobNotFoundError,
+    ProjectCreateNotAllowedError,
+    ProjectNotFoundError,
+)
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
+from datachain.job import Job
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.data_model import (
     DataModel,

@@ -50,11 +54,12 @@ from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
+from datachain.project import Project
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
 from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
-from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
+from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict

 from .database import DEFAULT_DATABASE_BATCH_SIZE
 from .utils import (

@@ -209,6 +214,14 @@ class DataChain:
         self.print_schema(file=file)
         return file.getvalue()

+    def hash(self) -> str:
+        """
+        Calculates SHA hash of this chain. Hash calculation is fast and consistent.
+        It takes into account all the steps added to the chain and their inputs.
+        Order of the steps is important.
+        """
+        return self._query.hash()
+
     def _as_delta(
         self,
         on: Optional[Union[str, Sequence[str]]] = None,

@@ -570,6 +583,19 @@ class DataChain:
             query=self._query.save(project=project, feature_schema=schema)
         )

+    def _calculate_job_hash(self, job_id: str) -> str:
+        """
+        Calculates hash of the job at the place of this chain's save method.
+        Hash is calculated using previous job checkpoint hash (if exists) and
+        adding hash of this chain to produce new hash.
+        """
+        last_checkpoint = self.session.catalog.metastore.get_last_checkpoint(job_id)
+
+        return hashlib.sha256(
+            (bytes.fromhex(last_checkpoint.hash) if last_checkpoint else b"")
+            + bytes.fromhex(self.hash())
+        ).hexdigest()
+
     def save(  # type: ignore[override]
         self,
         name: str,

@@ -594,101 +620,171 @@ class DataChain:
         update_version: which part of the dataset version to automatically increase.
             Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+
         catalog = self.session.catalog
-        if version is not None:
-            semver.validate(version)

-
-
-
-
-            raise ValueError(
-                "update_version can have one of the following values: major, minor or"
-                " patch"
-            )
+        result = None  # result chain that will be returned at the end
+
+        # Version validation
+        self._validate_version(version)
+        self._validate_update_version(update_version)

         namespace_name, project_name, name = catalog.get_full_dataset_name(
             name,
             namespace_name=self._settings.namespace,
             project_name=self._settings.project,
         )
+        project = self._get_or_create_project(namespace_name, project_name)
+
+        # Checkpoint handling
+        job, _hash, result = self._resolve_checkpoint(name, project, kwargs)
+
+        # Schema preparation
+        schema = self.signals_schema.clone_without_sys_signals().serialize()
+
+        # Handle retry and delta functionality
+        if not result:
+            result = self._handle_delta(name, version, project, schema, kwargs)
+
+        if not result:
+            # calculate chain if we already don't have result from checkpoint or delta
+            result = self._evolve(
+                query=self._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    description=description,
+                    attrs=attrs,
+                    feature_schema=schema,
+                    update_version=update_version,
+                    **kwargs,
+                )
+            )
+
+        if job:
+            catalog.metastore.create_checkpoint(job.id, _hash)  # type: ignore[arg-type]

+        return result
+
+    def _validate_version(self, version: Optional[str]) -> None:
+        """Validate dataset version if provided."""
+        if version is not None:
+            semver.validate(version)
+
+    def _validate_update_version(self, update_version: Optional[str]) -> None:
+        """Ensure update_version is one of: major, minor, patch."""
+        allowed = ["major", "minor", "patch"]
+        if update_version not in allowed:
+            raise ValueError(f"update_version must be one of {allowed}")
+
+    def _get_or_create_project(self, namespace: str, project_name: str) -> Project:
+        """Get project or raise if creation not allowed."""
         try:
-
+            return self.session.catalog.metastore.get_project(
                 project_name,
-
+                namespace,
                 create=is_studio(),
             )
         except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
             raise ProjectCreateNotAllowedError("Creating project is not allowed") from e

-
+    def _resolve_checkpoint(
+        self,
+        name: str,
+        project: Project,
+        kwargs: dict,
+    ) -> tuple[Optional[Job], Optional[str], Optional["DataChain"]]:
+        """Check if checkpoint exists and return cached dataset if possible."""
+        from .datasets import read_dataset

-
-        if self.delta and name:
-            from datachain.delta import delta_retry_update
+        metastore = self.session.catalog.metastore

-
-
+        job_id = os.getenv("DATACHAIN_JOB_ID")
+        checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)

-
-
-
-
-
-
-
-
-
+        if not job_id:
+            return None, None, None
+
+        job = metastore.get_job(job_id)
+        if not job:
+            raise JobNotFoundError(f"Job with id {job_id} not found")
+
+        _hash = self._calculate_job_hash(job.id)
+
+        if (
+            job.parent_job_id
+            and not checkpoints_reset
+            and metastore.find_checkpoint(job.parent_job_id, _hash)
+        ):
+            # checkpoint found → reuse dataset
+            chain = read_dataset(
+                name, namespace=project.namespace.name, project=project.name, **kwargs
             )
+            return job, _hash, chain

-
-            return self._evolve(
-                query=result_ds._query.save(
-                    name=name,
-                    version=version,
-                    project=project,
-                    feature_schema=schema,
-                    dependencies=dependencies,
-                    **kwargs,
-                )
-            )
+        return job, _hash, None

-
-
-
-
-
-
+    def _handle_delta(
+        self,
+        name: str,
+        version: Optional[str],
+        project: Project,
+        schema: dict,
+        kwargs: dict,
+    ) -> Optional["DataChain"]:
+        """Try to save as a delta dataset.
+        Returns:
+            A DataChain if delta logic could handle it, otherwise None to fall back
+            to the regular save path (e.g., on first dataset creation).
+        """
+        from datachain.delta import delta_retry_update

-
-            name, namespace=namespace_name, project=project_name, **kwargs
-        )
+        from .datasets import read_dataset

-
-
-
-
-
-
-
-
-
-
-
+        if not self.delta or not name:
+            return None
+
+        assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+        result_ds, dependencies, has_changes = delta_retry_update(
+            self,
+            project.namespace.name,
+            project.name,
+            name,
+            on=self._delta_on,
+            right_on=self._delta_result_on,
+            compare=self._delta_compare,
+            delta_retry=self._delta_retry,
         )

-
-
-
-
-
-
+        # Case 1: delta produced a new dataset
+        if result_ds:
+            return self._evolve(
+                query=result_ds._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    feature_schema=schema,
+                    dependencies=dependencies,
+                    **kwargs,
+                )
             )

-
+        # Case 2: no changes → reuse last version
+        if not has_changes:
+            # sources have not been changed so new version of resulting dataset
+            # would be the same as previous one. To avoid duplicating exact
+            # datasets, we won't create new version of it and we will return
+            # current latest version instead.
+            return read_dataset(
+                name,
+                namespace=project.namespace.name,
+                project=project.name,
+                **kwargs,
+            )
+
+        # Case 3: first creation of dataset
+        return None

     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
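`save()` is now split into validation, project resolution, checkpoint resolution, and delta handling; the checkpoint branch only activates inside a job (`DATACHAIN_JOB_ID` set) and can be bypassed with `DATACHAIN_CHECKPOINTS_RESET`. A hedged sketch of the intended effect, with illustrative dataset and signal names:

```python
import datachain as dc

# Building the same chain twice yields the same hash, because the hash covers
# the steps and their inputs (see DataChain.hash / DatasetQuery.hash).
chain = dc.read_values(num=[1, 2, 3]).map(double=lambda num: num * 2, output=int)
print(chain.hash())

# Inside a job run (DATACHAIN_JOB_ID set), save() derives a per-save hash from
# the previous checkpoint plus chain.hash(); if the parent job already has a
# checkpoint with that hash and DATACHAIN_CHECKPOINTS_RESET is not set, the
# previously saved dataset is read back instead of being recomputed.
chain.save("doubled-numbers")
```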
datachain/lib/signal_schema.py
CHANGED
@@ -1,4 +1,6 @@
 import copy
+import hashlib
+import json
 import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass

@@ -257,6 +259,11 @@ class SignalSchema:
         signals["_custom_types"] = custom_types
         return signals

+    def hash(self) -> str:
+        """Create SHA hash of this schema"""
+        json_str = json.dumps(self.serialize(), sort_keys=True, separators=(",", ":"))
+        return hashlib.sha256(json_str.encode("utf-8")).hexdigest()
+
     @staticmethod
     def _split_subtypes(type_name: str) -> list[str]:
         """This splits a list of subtypes, including proper square bracket handling."""
datachain/lib/udf.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import sys
 import traceback
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence

@@ -12,6 +13,7 @@ from pydantic import BaseModel
 from datachain.asyn import AsyncMapper
 from datachain.cache import temporary_cache
 from datachain.dataset import RowDict
+from datachain.hash_utils import hash_callable
 from datachain.lib.convert.flatten import flatten
 from datachain.lib.file import DataModel, File
 from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError

@@ -61,6 +63,9 @@ class UDFAdapter:
     batch_size: Optional[int] = None
     batch: int = 1

+    def hash(self) -> str:
+        return self.inner.hash()
+
     def get_batching(self, use_partitioning: bool = False) -> BatchingStrategy:
         if use_partitioning:
             return Partition()

@@ -151,6 +156,21 @@ class UDFBase(AbstractUDF):
         self.output = None
         self._func = None

+    def hash(self) -> str:
+        """
+        Creates SHA hash of this UDF function. It takes into account function,
+        inputs and outputs.
+        """
+        parts = [
+            hash_callable(self._func),
+            self.params.hash() if self.params else "",
+            self.output.hash(),
+        ]
+
+        return hashlib.sha256(
+            b"".join([bytes.fromhex(part) for part in parts])
+        ).hexdigest()
+
     def process(self, *args, **kwargs):
         """Processing function that needs to be defined by user"""
         if not self._func:
datachain/query/dataset.py
CHANGED
@@ -1,4 +1,5 @@
 import contextlib
+import hashlib
 import inspect
 import logging
 import os

@@ -44,6 +45,7 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetDependency, DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.func.base import Function
+from datachain.hash_utils import hash_column_elements
 from datachain.lib.listing import is_listing_dataset, listing_dataset_expired
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import UDFAdapter, _get_cache

@@ -57,6 +59,7 @@ from datachain.sql.types import SQLType
 from datachain.utils import (
     determine_processes,
     determine_workers,
+    ensure_sequence,
     filtered_cloudpickle_dumps,
     get_datachain_executable,
     safe_closing,

@@ -167,6 +170,18 @@ class Step(ABC):
     ) -> "StepResult":
         """Apply the processing step."""

+    @abstractmethod
+    def hash_inputs(self) -> str:
+        """Calculates hash of step inputs"""
+
+    def hash(self) -> str:
+        """
+        Calculates hash for step which includes step name and hash of it's inputs
+        """
+        return hashlib.sha256(
+            f"{self.__class__.__name__}|{self.hash_inputs()}".encode()
+        ).hexdigest()
+

 @frozen
 class QueryStep:

@@ -186,6 +201,11 @@ class QueryStep:
             q, dr.columns, dependencies=[(self.dataset, self.dataset_version)]
         )

+    def hash(self) -> str:
+        return hashlib.sha256(
+            self.dataset.uri(self.dataset_version).encode()
+        ).hexdigest()
+

 def generator_then_call(generator, func: Callable):
     """

@@ -256,6 +276,13 @@ class DatasetDiffOperation(Step):
 class Subtract(DatasetDiffOperation):
     on: Sequence[tuple[str, str]]

+    def hash_inputs(self) -> str:
+        on_bytes = b"".join(
+            f"{a}:{b}".encode() for a, b in sorted(self.on, key=lambda t: (t[0], t[1]))
+        )
+
+        return hashlib.sha256(bytes.fromhex(self.dq.hash()) + on_bytes).hexdigest()
+
     def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
         sq = source_query.alias("source_query")
         tq = target_query.alias("target_query")

@@ -393,6 +420,16 @@ class UDFStep(Step, ABC):
     min_task_size: Optional[int] = None
     batch_size: Optional[int] = None

+    def hash_inputs(self) -> str:
+        partition_by = ensure_sequence(self.partition_by or [])
+        parts = [
+            bytes.fromhex(self.udf.hash()),
+            bytes.fromhex(hash_column_elements(partition_by)),
+            str(self.is_generator).encode(),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     @abstractmethod
     def create_udf_table(self, query: Select) -> "Table":
         """Method that creates a table where temp udf results will be saved"""

@@ -790,6 +827,9 @@ class SQLClause(Step, ABC):
 class SQLSelect(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query) -> Select:
         subquery = query.subquery()
         args = [

@@ -806,6 +846,9 @@ class SQLSelect(SQLClause):
 class SQLSelectExcept(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         subquery = query.subquery()
         args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]

@@ -817,6 +860,9 @@ class SQLMutate(SQLClause):
     args: tuple[Label, ...]
     new_schema: SignalSchema

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
         to_mutate = {c.name for c in self.args}

@@ -846,6 +892,9 @@ class SQLMutate(SQLClause):
 class SQLFilter(SQLClause):
     expressions: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.expressions)
+
     def __and__(self, other):
         expressions = self.parse_cols(self.expressions)
         return self.__class__(expressions + other)

@@ -859,6 +908,9 @@ class SQLFilter(SQLClause):
 class SQLOrderBy(SQLClause):
     args: tuple[Union[Function, ColumnElement], ...]

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query: Select) -> Select:
         args = self.parse_cols(self.args)
         return query.order_by(*args)

@@ -868,6 +920,9 @@ class SQLOrderBy(SQLClause):
 class SQLLimit(SQLClause):
     n: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.n).encode()).hexdigest()
+
     def apply_sql_clause(self, query: Select) -> Select:
         return query.limit(self.n)

@@ -876,12 +931,18 @@ class SQLLimit(SQLClause):
 class SQLOffset(SQLClause):
     offset: int

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(str(self.offset).encode()).hexdigest()
+
     def apply_sql_clause(self, query: "GenerativeSelect"):
         return query.offset(self.offset)


 @frozen
 class SQLCount(SQLClause):
+    def hash_inputs(self) -> str:
+        return ""
+
     def apply_sql_clause(self, query):
         return sqlalchemy.select(f.count(1)).select_from(query.subquery())

@@ -891,6 +952,9 @@ class SQLDistinct(SQLClause):
     args: tuple[ColumnElement, ...]
     dialect: str

+    def hash_inputs(self) -> str:
+        return hash_column_elements(self.args)
+
     def apply_sql_clause(self, query):
         if self.dialect == "sqlite":
             return query.group_by(*self.args)

@@ -903,6 +967,11 @@ class SQLUnion(Step):
     query1: "DatasetQuery"
     query2: "DatasetQuery"

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(self.query1.hash()) + bytes.fromhex(self.query2.hash())
+        ).hexdigest()
+
     def apply(
         self, query_generator: QueryGenerator, temp_tables: list[str]
     ) -> StepResult:

@@ -939,6 +1008,20 @@ class SQLJoin(Step):
     full: bool
     rname: str

+    def hash_inputs(self) -> str:
+        predicates = ensure_sequence(self.predicates or [])
+
+        parts = [
+            bytes.fromhex(self.query1.hash()),
+            bytes.fromhex(self.query2.hash()),
+            bytes.fromhex(hash_column_elements(predicates)),
+            str(self.inner).encode(),
+            str(self.full).encode(),
+            self.rname.encode("utf-8"),
+        ]
+
+        return hashlib.sha256(b"".join(parts)).hexdigest()
+
     def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
         query = dq.apply_steps().select()
         temp_tables.extend(dq.temp_table_names)

@@ -1060,6 +1143,13 @@ class SQLGroupBy(SQLClause):
     cols: Sequence[Union[str, Function, ColumnElement]]
     group_by: Sequence[Union[str, Function, ColumnElement]]

+    def hash_inputs(self) -> str:
+        return hashlib.sha256(
+            bytes.fromhex(
+                hash_column_elements(self.cols) + hash_column_elements(self.group_by)
+            )
+        ).hexdigest()
+
     def apply_sql_clause(self, query) -> Select:
         if not self.cols:
             raise ValueError("No columns to select")

@@ -1213,6 +1303,23 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)

+    def hash(self) -> str:
+        """
+        Calculates hash of this class taking into account hash of starting step
+        and hashes of each following steps. Ordering is important.
+        """
+        hasher = hashlib.sha256()
+        if self.starting_step:
+            hasher.update(self.starting_step.hash().encode("utf-8"))
+        else:
+            assert self.list_ds_name
+            hasher.update(self.list_ds_name.encode("utf-8"))
+
+        for step in self.steps:
+            hasher.update(step.hash().encode("utf-8"))
+
+        return hasher.hexdigest()
+
     @staticmethod
     def get_table() -> "TableClause":
         table_name = "".join(
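Every `Step` now contributes `sha256(class_name | hash_inputs())`, and `DatasetQuery.hash()` folds those in order after the starting step, so identical chains hash identically while reordering steps changes the hash. A hedged sketch; the dataset and column names are illustrative:

```python
import datachain as dc

base = dc.read_dataset("numbers")  # assumed to already exist

a = base.filter(dc.C("num") > 2).limit(10)
b = base.filter(dc.C("num") > 2).limit(10)
c = base.limit(10).filter(dc.C("num") > 2)

assert a.hash() == b.hash()  # same steps, same inputs -> same hash
assert a.hash() != c.hash()  # same steps, different order -> different hash
```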
datachain/utils.py
CHANGED
@@ -537,3 +537,9 @@ def getenv_bool(name: str, default: bool = False) -> bool:
     if val is None:
         return default
     return val.lower() in ("1", "true", "yes", "on")
+
+
+def ensure_sequence(x) -> Sequence:
+    if isinstance(x, Sequence) and not isinstance(x, (str, bytes)):
+        return x
+    return [x]
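`ensure_sequence` is what lets the hashing code accept either a single column or a list for `partition_by` and join predicates. A quick sketch of its behaviour:

```python
from datachain.utils import ensure_sequence

assert ensure_sequence("file.path") == ["file.path"]  # scalars and strings get wrapped
assert ensure_sequence(["a", "b"]) == ["a", "b"]      # lists pass through unchanged
assert ensure_sequence(("a", "b")) == ("a", "b")      # tuples are sequences already
```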
{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.33.0
+Version: 0.34.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0

@@ -103,7 +103,7 @@ Requires-Dist: scipy; extra == "tests"
 Requires-Dist: ultralytics; extra == "tests"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests]; extra == "dev"
-Requires-Dist: mypy==1.18.
+Requires-Dist: mypy==1.18.2; extra == "dev"
 Requires-Dist: types-python-dateutil; extra == "dev"
 Requires-Dist: types-dateparser; extra == "dev"
 Requires-Dist: types-pytz; extra == "dev"
{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/RECORD
CHANGED

@@ -6,8 +6,9 @@ datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=
-datachain/
+datachain/error.py,sha256=P_5KXlfVIsW4E42JJCoFhGsgvY8la-6jXBEWbHbgqKo,1846
+datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
+datachain/job.py,sha256=WDkZrr4Je50nngRDaRapNpGpx_50L6wYWmAqcMT_yCw,1367
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577

@@ -20,9 +21,9 @@ datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=oI4YBuuOJGVx_Fp1cDoFb56lPV7Or27ZquzR8oM1m3Y,69133
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298

@@ -50,13 +51,13 @@ datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,460
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
-datachain/data_storage/job.py,sha256=
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
+datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
 datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
 datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
-datachain/diff/__init__.py,sha256
+datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
 datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825

@@ -88,10 +89,10 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
 datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
 datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
 datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
 datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
 datachain/lib/udf_signature.py,sha256=Yz20iJ-WF1pijT3hvcDIKFzgWV9gFxZM73KZRx3NbPk,7560
 datachain/lib/utils.py,sha256=RLji1gHnfDXtJCnBo8BcNu1obndFpVsXJ_1Vb-FQ9Qo,4554
 datachain/lib/video.py,sha256=ddVstiMkfxyBPDsnjCKY0d_93bw-DcMqGqN60yzsZoo,6851

@@ -106,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=
+datachain/lib/dc/datachain.py,sha256=uUAPchtNXyJo1tzFd3z1MLWhVC2dzO2ZjhTS0naqXiE,104032
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742

@@ -129,7 +130,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
 datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
 datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863

@@ -163,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.34.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.34.0.dist-info/METADATA,sha256=YBmM_daqadosEKHBY-QLxSRxYn55XuhB0S0tfeEfzts,13655
+datachain-0.34.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.34.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.34.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.34.0.dist-info/RECORD,,

{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/WHEEL
File without changes

{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/entry_points.txt
File without changes

{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/licenses/LICENSE
File without changes

{datachain-0.33.0.dist-info → datachain-0.34.0.dist-info}/top_level.txt
File without changes