datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/func/conditional.py
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from typing import Optional, Union
|
|
2
|
-
|
|
3
1
|
from sqlalchemy import ColumnElement
|
|
4
2
|
from sqlalchemy import and_ as sql_and
|
|
5
3
|
from sqlalchemy import case as sql_case
|
|
@@ -12,10 +10,10 @@ from datachain.sql.functions import conditional
|
|
|
12
10
|
|
|
13
11
|
from .func import Func
|
|
14
12
|
|
|
15
|
-
CaseT =
|
|
13
|
+
CaseT = int | float | complex | bool | str | Func | ColumnElement
|
|
16
14
|
|
|
17
15
|
|
|
18
|
-
def greatest(*args:
|
|
16
|
+
def greatest(*args: str | Column | Func | float) -> Func:
|
|
19
17
|
"""
|
|
20
18
|
Returns the greatest (largest) value from the given input values.
|
|
21
19
|
|
|
@@ -56,7 +54,7 @@ def greatest(*args: Union[str, Column, Func, float]) -> Func:
|
|
|
56
54
|
)
|
|
57
55
|
|
|
58
56
|
|
|
59
|
-
def least(*args:
|
|
57
|
+
def least(*args: str | Column | Func | float) -> Func:
|
|
60
58
|
"""
|
|
61
59
|
Returns the least (smallest) value from the given input values.
|
|
62
60
|
|
|
@@ -94,7 +92,7 @@ def least(*args: Union[str, Column, Func, float]) -> Func:
|
|
|
94
92
|
|
|
95
93
|
|
|
96
94
|
def case(
|
|
97
|
-
*args: tuple[
|
|
95
|
+
*args: tuple[ColumnElement | Func | bool, CaseT], else_: CaseT | None = None
|
|
98
96
|
) -> Func:
|
|
99
97
|
"""
|
|
100
98
|
Returns a case expression that evaluates a list of conditions and returns
|
|
@@ -163,9 +161,7 @@ def case(
|
|
|
163
161
|
return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
|
|
164
162
|
|
|
165
163
|
|
|
166
|
-
def ifelse(
|
|
167
|
-
condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
|
|
168
|
-
) -> Func:
|
|
164
|
+
def ifelse(condition: ColumnElement | Func, if_val: CaseT, else_val: CaseT) -> Func:
|
|
169
165
|
"""
|
|
170
166
|
Returns an if-else expression that evaluates a condition and returns one
|
|
171
167
|
of two values based on the result. Values can be Python primitives
|
|
@@ -193,7 +189,7 @@ def ifelse(
|
|
|
193
189
|
return case((condition, if_val), else_=else_val)
|
|
194
190
|
|
|
195
191
|
|
|
196
|
-
def isnone(col:
|
|
192
|
+
def isnone(col: str | ColumnElement) -> Func:
|
|
197
193
|
"""
|
|
198
194
|
Returns a function that checks if the column value is `None` (NULL in DB).
|
|
199
195
|
|
|
@@ -221,7 +217,7 @@ def isnone(col: Union[str, ColumnElement]) -> Func:
|
|
|
221
217
|
return case((col.is_(None) if col is not None else True, True), else_=False)
|
|
222
218
|
|
|
223
219
|
|
|
224
|
-
def or_(*args:
|
|
220
|
+
def or_(*args: ColumnElement | Func) -> Func:
|
|
225
221
|
"""
|
|
226
222
|
Returns the function that produces conjunction of expressions joined by OR
|
|
227
223
|
logical operator.
|
|
@@ -256,7 +252,7 @@ def or_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
256
252
|
return Func("or", inner=sql_or, cols=cols, args=func_args, result_type=bool)
|
|
257
253
|
|
|
258
254
|
|
|
259
|
-
def and_(*args:
|
|
255
|
+
def and_(*args: ColumnElement | Func) -> Func:
|
|
260
256
|
"""
|
|
261
257
|
Returns the function that produces conjunction of expressions joined by AND
|
|
262
258
|
logical operator.
|
|
@@ -291,7 +287,7 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
|
|
|
291
287
|
return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
|
|
292
288
|
|
|
293
289
|
|
|
294
|
-
def not_(arg:
|
|
290
|
+
def not_(arg: ColumnElement | Func) -> Func:
|
|
295
291
|
"""
|
|
296
292
|
Returns the function that produces NOT of the given expressions.
|
|
297
293
|
|
datachain/func/func.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
import inspect
|
|
2
|
-
from collections.abc import Sequence
|
|
3
|
-
from typing import TYPE_CHECKING, Any,
|
|
2
|
+
from collections.abc import Callable, Sequence
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
|
|
4
4
|
|
|
5
5
|
from sqlalchemy import BindParameter, Case, ColumnElement, Integer, cast, desc
|
|
6
6
|
from sqlalchemy.sql import func as sa_func
|
|
7
7
|
|
|
8
8
|
from datachain.lib.convert.python_to_sql import python_to_sql
|
|
9
9
|
from datachain.lib.convert.sql_to_python import sql_to_python
|
|
10
|
+
from datachain.lib.model_store import ModelStore
|
|
10
11
|
from datachain.lib.utils import DataChainColumnError, DataChainParamsError
|
|
11
12
|
from datachain.query.schema import Column, ColumnMeta
|
|
12
13
|
from datachain.sql.functions import numeric
|
|
@@ -22,26 +23,29 @@ if TYPE_CHECKING:
|
|
|
22
23
|
from .window import Window
|
|
23
24
|
|
|
24
25
|
|
|
25
|
-
ColT = Union[str, Column, ColumnElement, "Func"
|
|
26
|
+
ColT = Union[str, tuple, Column, ColumnElement, "Func"]
|
|
26
27
|
|
|
27
28
|
|
|
28
29
|
class Func(Function): # noqa: PLW1641
|
|
29
30
|
"""Represents a function to be applied to a column in a SQL query."""
|
|
30
31
|
|
|
32
|
+
cols: Sequence[ColT]
|
|
33
|
+
args: Sequence[Any]
|
|
34
|
+
|
|
31
35
|
def __init__(
|
|
32
36
|
self,
|
|
33
37
|
name: str,
|
|
34
38
|
inner: Callable,
|
|
35
|
-
cols:
|
|
36
|
-
args:
|
|
37
|
-
kwargs:
|
|
38
|
-
result_type:
|
|
39
|
-
type_from_args:
|
|
39
|
+
cols: Sequence[ColT] | None = None,
|
|
40
|
+
args: Sequence[Any] | None = None,
|
|
41
|
+
kwargs: dict[str, Any] | None = None,
|
|
42
|
+
result_type: "DataType | None" = None,
|
|
43
|
+
type_from_args: Callable[..., "DataType"] | None = None,
|
|
40
44
|
is_array: bool = False,
|
|
41
45
|
from_array: bool = False,
|
|
42
46
|
is_window: bool = False,
|
|
43
|
-
window:
|
|
44
|
-
label:
|
|
47
|
+
window: "Window | None" = None,
|
|
48
|
+
label: str | None = None,
|
|
45
49
|
) -> None:
|
|
46
50
|
self.name = name
|
|
47
51
|
self.inner = inner
|
|
@@ -95,7 +99,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
95
99
|
else []
|
|
96
100
|
)
|
|
97
101
|
|
|
98
|
-
def _db_col_type(self, signals_schema: "SignalSchema") ->
|
|
102
|
+
def _db_col_type(self, signals_schema: "SignalSchema") -> "DataType | None":
|
|
99
103
|
if not self._db_cols:
|
|
100
104
|
return None
|
|
101
105
|
|
|
@@ -125,51 +129,51 @@ class Func(Function): # noqa: PLW1641
|
|
|
125
129
|
|
|
126
130
|
return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
|
|
127
131
|
|
|
128
|
-
def __add__(self, other:
|
|
132
|
+
def __add__(self, other: ColT | float) -> "Func":
|
|
129
133
|
if isinstance(other, (int, float)):
|
|
130
134
|
return Func("add", lambda a: a + other, [self])
|
|
131
135
|
return Func("add", lambda a1, a2: a1 + a2, [self, other])
|
|
132
136
|
|
|
133
|
-
def __radd__(self, other:
|
|
137
|
+
def __radd__(self, other: ColT | float) -> "Func":
|
|
134
138
|
if isinstance(other, (int, float)):
|
|
135
139
|
return Func("add", lambda a: other + a, [self])
|
|
136
140
|
return Func("add", lambda a1, a2: a1 + a2, [other, self])
|
|
137
141
|
|
|
138
|
-
def __sub__(self, other:
|
|
142
|
+
def __sub__(self, other: ColT | float) -> "Func":
|
|
139
143
|
if isinstance(other, (int, float)):
|
|
140
144
|
return Func("sub", lambda a: a - other, [self])
|
|
141
145
|
return Func("sub", lambda a1, a2: a1 - a2, [self, other])
|
|
142
146
|
|
|
143
|
-
def __rsub__(self, other:
|
|
147
|
+
def __rsub__(self, other: ColT | float) -> "Func":
|
|
144
148
|
if isinstance(other, (int, float)):
|
|
145
149
|
return Func("sub", lambda a: other - a, [self])
|
|
146
150
|
return Func("sub", lambda a1, a2: a1 - a2, [other, self])
|
|
147
151
|
|
|
148
|
-
def __mul__(self, other:
|
|
152
|
+
def __mul__(self, other: ColT | float) -> "Func":
|
|
149
153
|
if isinstance(other, (int, float)):
|
|
150
154
|
return Func("mul", lambda a: a * other, [self])
|
|
151
155
|
return Func("mul", lambda a1, a2: a1 * a2, [self, other])
|
|
152
156
|
|
|
153
|
-
def __rmul__(self, other:
|
|
157
|
+
def __rmul__(self, other: ColT | float) -> "Func":
|
|
154
158
|
if isinstance(other, (int, float)):
|
|
155
159
|
return Func("mul", lambda a: other * a, [self])
|
|
156
160
|
return Func("mul", lambda a1, a2: a1 * a2, [other, self])
|
|
157
161
|
|
|
158
|
-
def __truediv__(self, other:
|
|
162
|
+
def __truediv__(self, other: ColT | float) -> "Func":
|
|
159
163
|
if isinstance(other, (int, float)):
|
|
160
164
|
return Func("div", lambda a: _truediv(a, other), [self], result_type=float)
|
|
161
165
|
return Func(
|
|
162
166
|
"div", lambda a1, a2: _truediv(a1, a2), [self, other], result_type=float
|
|
163
167
|
)
|
|
164
168
|
|
|
165
|
-
def __rtruediv__(self, other:
|
|
169
|
+
def __rtruediv__(self, other: ColT | float) -> "Func":
|
|
166
170
|
if isinstance(other, (int, float)):
|
|
167
171
|
return Func("div", lambda a: _truediv(other, a), [self], result_type=float)
|
|
168
172
|
return Func(
|
|
169
173
|
"div", lambda a1, a2: _truediv(a1, a2), [other, self], result_type=float
|
|
170
174
|
)
|
|
171
175
|
|
|
172
|
-
def __floordiv__(self, other:
|
|
176
|
+
def __floordiv__(self, other: ColT | float) -> "Func":
|
|
173
177
|
if isinstance(other, (int, float)):
|
|
174
178
|
return Func(
|
|
175
179
|
"floordiv", lambda a: _floordiv(a, other), [self], result_type=int
|
|
@@ -178,7 +182,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
178
182
|
"floordiv", lambda a1, a2: _floordiv(a1, a2), [self, other], result_type=int
|
|
179
183
|
)
|
|
180
184
|
|
|
181
|
-
def __rfloordiv__(self, other:
|
|
185
|
+
def __rfloordiv__(self, other: ColT | float) -> "Func":
|
|
182
186
|
if isinstance(other, (int, float)):
|
|
183
187
|
return Func(
|
|
184
188
|
"floordiv", lambda a: _floordiv(other, a), [self], result_type=int
|
|
@@ -187,17 +191,17 @@ class Func(Function): # noqa: PLW1641
|
|
|
187
191
|
"floordiv", lambda a1, a2: _floordiv(a1, a2), [other, self], result_type=int
|
|
188
192
|
)
|
|
189
193
|
|
|
190
|
-
def __mod__(self, other:
|
|
194
|
+
def __mod__(self, other: ColT | float) -> "Func":
|
|
191
195
|
if isinstance(other, (int, float)):
|
|
192
196
|
return Func("mod", lambda a: a % other, [self], result_type=int)
|
|
193
197
|
return Func("mod", lambda a1, a2: a1 % a2, [self, other], result_type=int)
|
|
194
198
|
|
|
195
|
-
def __rmod__(self, other:
|
|
199
|
+
def __rmod__(self, other: ColT | float) -> "Func":
|
|
196
200
|
if isinstance(other, (int, float)):
|
|
197
201
|
return Func("mod", lambda a: other % a, [self], result_type=int)
|
|
198
202
|
return Func("mod", lambda a1, a2: a1 % a2, [other, self], result_type=int)
|
|
199
203
|
|
|
200
|
-
def __and__(self, other:
|
|
204
|
+
def __and__(self, other: ColT | float) -> "Func":
|
|
201
205
|
if isinstance(other, (int, float)):
|
|
202
206
|
return Func(
|
|
203
207
|
"and", lambda a: numeric.bit_and(a, other), [self], result_type=int
|
|
@@ -209,7 +213,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
209
213
|
result_type=int,
|
|
210
214
|
)
|
|
211
215
|
|
|
212
|
-
def __rand__(self, other:
|
|
216
|
+
def __rand__(self, other: ColT | float) -> "Func":
|
|
213
217
|
if isinstance(other, (int, float)):
|
|
214
218
|
return Func(
|
|
215
219
|
"and", lambda a: numeric.bit_and(other, a), [self], result_type=int
|
|
@@ -221,7 +225,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
221
225
|
result_type=int,
|
|
222
226
|
)
|
|
223
227
|
|
|
224
|
-
def __or__(self, other:
|
|
228
|
+
def __or__(self, other: ColT | float) -> "Func":
|
|
225
229
|
if isinstance(other, (int, float)):
|
|
226
230
|
return Func(
|
|
227
231
|
"or", lambda a: numeric.bit_or(a, other), [self], result_type=int
|
|
@@ -230,7 +234,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
230
234
|
"or", lambda a1, a2: numeric.bit_or(a1, a2), [self, other], result_type=int
|
|
231
235
|
)
|
|
232
236
|
|
|
233
|
-
def __ror__(self, other:
|
|
237
|
+
def __ror__(self, other: ColT | float) -> "Func":
|
|
234
238
|
if isinstance(other, (int, float)):
|
|
235
239
|
return Func(
|
|
236
240
|
"or", lambda a: numeric.bit_or(other, a), [self], result_type=int
|
|
@@ -239,7 +243,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
239
243
|
"or", lambda a1, a2: numeric.bit_or(a1, a2), [other, self], result_type=int
|
|
240
244
|
)
|
|
241
245
|
|
|
242
|
-
def __xor__(self, other:
|
|
246
|
+
def __xor__(self, other: ColT | float) -> "Func":
|
|
243
247
|
if isinstance(other, (int, float)):
|
|
244
248
|
return Func(
|
|
245
249
|
"xor", lambda a: numeric.bit_xor(a, other), [self], result_type=int
|
|
@@ -251,7 +255,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
251
255
|
result_type=int,
|
|
252
256
|
)
|
|
253
257
|
|
|
254
|
-
def __rxor__(self, other:
|
|
258
|
+
def __rxor__(self, other: ColT | float) -> "Func":
|
|
255
259
|
if isinstance(other, (int, float)):
|
|
256
260
|
return Func(
|
|
257
261
|
"xor", lambda a: numeric.bit_xor(other, a), [self], result_type=int
|
|
@@ -263,7 +267,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
263
267
|
result_type=int,
|
|
264
268
|
)
|
|
265
269
|
|
|
266
|
-
def __rshift__(self, other:
|
|
270
|
+
def __rshift__(self, other: ColT | float) -> "Func":
|
|
267
271
|
if isinstance(other, (int, float)):
|
|
268
272
|
return Func(
|
|
269
273
|
"rshift",
|
|
@@ -278,7 +282,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
278
282
|
result_type=int,
|
|
279
283
|
)
|
|
280
284
|
|
|
281
|
-
def __rrshift__(self, other:
|
|
285
|
+
def __rrshift__(self, other: ColT | float) -> "Func":
|
|
282
286
|
if isinstance(other, (int, float)):
|
|
283
287
|
return Func(
|
|
284
288
|
"rshift",
|
|
@@ -293,7 +297,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
293
297
|
result_type=int,
|
|
294
298
|
)
|
|
295
299
|
|
|
296
|
-
def __lshift__(self, other:
|
|
300
|
+
def __lshift__(self, other: ColT | float) -> "Func":
|
|
297
301
|
if isinstance(other, (int, float)):
|
|
298
302
|
return Func(
|
|
299
303
|
"lshift",
|
|
@@ -308,7 +312,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
308
312
|
result_type=int,
|
|
309
313
|
)
|
|
310
314
|
|
|
311
|
-
def __rlshift__(self, other:
|
|
315
|
+
def __rlshift__(self, other: ColT | float) -> "Func":
|
|
312
316
|
if isinstance(other, (int, float)):
|
|
313
317
|
return Func(
|
|
314
318
|
"lshift",
|
|
@@ -323,12 +327,12 @@ class Func(Function): # noqa: PLW1641
|
|
|
323
327
|
result_type=int,
|
|
324
328
|
)
|
|
325
329
|
|
|
326
|
-
def __lt__(self, other:
|
|
330
|
+
def __lt__(self, other: ColT | float) -> "Func":
|
|
327
331
|
if isinstance(other, (int, float)):
|
|
328
332
|
return Func("lt", lambda a: a < other, [self], result_type=bool)
|
|
329
333
|
return Func("lt", lambda a1, a2: a1 < a2, [self, other], result_type=bool)
|
|
330
334
|
|
|
331
|
-
def __le__(self, other:
|
|
335
|
+
def __le__(self, other: ColT | float) -> "Func":
|
|
332
336
|
if isinstance(other, (int, float)):
|
|
333
337
|
return Func("le", lambda a: a <= other, [self], result_type=bool)
|
|
334
338
|
return Func("le", lambda a1, a2: a1 <= a2, [self, other], result_type=bool)
|
|
@@ -343,12 +347,12 @@ class Func(Function): # noqa: PLW1641
|
|
|
343
347
|
return Func("ne", lambda a: a != other, [self], result_type=bool)
|
|
344
348
|
return Func("ne", lambda a1, a2: a1 != a2, [self, other], result_type=bool)
|
|
345
349
|
|
|
346
|
-
def __gt__(self, other:
|
|
350
|
+
def __gt__(self, other: ColT | float) -> "Func":
|
|
347
351
|
if isinstance(other, (int, float)):
|
|
348
352
|
return Func("gt", lambda a: a > other, [self], result_type=bool)
|
|
349
353
|
return Func("gt", lambda a1, a2: a1 > a2, [self, other], result_type=bool)
|
|
350
354
|
|
|
351
|
-
def __ge__(self, other:
|
|
355
|
+
def __ge__(self, other: ColT | float) -> "Func":
|
|
352
356
|
if isinstance(other, (int, float)):
|
|
353
357
|
return Func("ge", lambda a: a >= other, [self], result_type=bool)
|
|
354
358
|
return Func("ge", lambda a1, a2: a1 >= a2, [self, other], result_type=bool)
|
|
@@ -369,7 +373,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
369
373
|
label,
|
|
370
374
|
)
|
|
371
375
|
|
|
372
|
-
def get_col_name(self, label:
|
|
376
|
+
def get_col_name(self, label: str | None = None) -> str:
|
|
373
377
|
if label:
|
|
374
378
|
return label
|
|
375
379
|
if self.col_label:
|
|
@@ -384,7 +388,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
384
388
|
return self.name
|
|
385
389
|
|
|
386
390
|
def get_result_type(
|
|
387
|
-
self, signals_schema:
|
|
391
|
+
self, signals_schema: "SignalSchema | None" = None
|
|
388
392
|
) -> "DataType":
|
|
389
393
|
if self.result_type:
|
|
390
394
|
return self.result_type
|
|
@@ -408,10 +412,24 @@ class Func(Function): # noqa: PLW1641
|
|
|
408
412
|
|
|
409
413
|
def get_column(
|
|
410
414
|
self,
|
|
411
|
-
signals_schema:
|
|
412
|
-
label:
|
|
413
|
-
table:
|
|
415
|
+
signals_schema: "SignalSchema | None" = None,
|
|
416
|
+
label: str | None = None,
|
|
417
|
+
table: "TableClause | None" = None,
|
|
414
418
|
) -> Column:
|
|
419
|
+
# Guard against using complex (pydantic) object columns in SQL funcs
|
|
420
|
+
if signals_schema and self._db_cols:
|
|
421
|
+
for arg in self._db_cols:
|
|
422
|
+
# _db_cols normalizes known columns to strings; skip non-string args
|
|
423
|
+
if not isinstance(arg, str):
|
|
424
|
+
continue
|
|
425
|
+
t_with_sub = signals_schema.get_column_type(arg, with_subtree=True)
|
|
426
|
+
if ModelStore.is_pydantic(t_with_sub):
|
|
427
|
+
raise DataChainParamsError(
|
|
428
|
+
f"Function {self.name} doesn't support complex object "
|
|
429
|
+
f"columns like '{arg}'. Use a leaf field (e.g., "
|
|
430
|
+
f"'{arg}.path') or use UDFs to operate on complex objects."
|
|
431
|
+
)
|
|
432
|
+
|
|
415
433
|
col_type = self.get_result_type(signals_schema)
|
|
416
434
|
sql_type = python_to_sql(col_type)
|
|
417
435
|
|
|
@@ -431,6 +449,7 @@ class Func(Function): # noqa: PLW1641
|
|
|
431
449
|
return col
|
|
432
450
|
|
|
433
451
|
cols = [get_col(col) for col in self._db_cols]
|
|
452
|
+
|
|
434
453
|
kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()}
|
|
435
454
|
func_col = self.inner(*cols, *self.args, **kwargs)
|
|
436
455
|
|
|
@@ -467,9 +486,8 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
|
|
|
467
486
|
if isinstance(col, ColumnElement) and not hasattr(col, "name"):
|
|
468
487
|
return sql_to_python(col)
|
|
469
488
|
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
)
|
|
489
|
+
name = col.name if isinstance(col, ColumnElement) else col # type: ignore[assignment]
|
|
490
|
+
return signals_schema.get_column_type(name) # type: ignore[arg-type]
|
|
473
491
|
|
|
474
492
|
|
|
475
493
|
def _truediv(a, b):
|
datachain/func/numeric.py
CHANGED
|
@@ -1,12 +1,10 @@
|
|
|
1
|
-
from typing import Union
|
|
2
|
-
|
|
3
1
|
from datachain.query.schema import Column
|
|
4
2
|
from datachain.sql.functions import numeric
|
|
5
3
|
|
|
6
4
|
from .func import Func
|
|
7
5
|
|
|
8
6
|
|
|
9
|
-
def bit_and(*args:
|
|
7
|
+
def bit_and(*args: str | Column | Func | int) -> Func:
|
|
10
8
|
"""
|
|
11
9
|
Returns a function that computes the bitwise AND operation between two values.
|
|
12
10
|
|
|
@@ -51,7 +49,7 @@ def bit_and(*args: Union[str, Column, Func, int]) -> Func:
|
|
|
51
49
|
)
|
|
52
50
|
|
|
53
51
|
|
|
54
|
-
def bit_or(*args:
|
|
52
|
+
def bit_or(*args: str | Column | Func | int) -> Func:
|
|
55
53
|
"""
|
|
56
54
|
Returns a function that computes the bitwise OR operation between two values.
|
|
57
55
|
|
|
@@ -96,7 +94,7 @@ def bit_or(*args: Union[str, Column, Func, int]) -> Func:
|
|
|
96
94
|
)
|
|
97
95
|
|
|
98
96
|
|
|
99
|
-
def bit_xor(*args:
|
|
97
|
+
def bit_xor(*args: str | Column | Func | int) -> Func:
|
|
100
98
|
"""
|
|
101
99
|
Returns a function that computes the bitwise XOR operation between two values.
|
|
102
100
|
|
|
@@ -141,7 +139,7 @@ def bit_xor(*args: Union[str, Column, Func, int]) -> Func:
|
|
|
141
139
|
)
|
|
142
140
|
|
|
143
141
|
|
|
144
|
-
def int_hash_64(col:
|
|
142
|
+
def int_hash_64(col: str | Column | Func | int) -> Func:
|
|
145
143
|
"""
|
|
146
144
|
Returns a function that computes the 64-bit hash of an integer.
|
|
147
145
|
|
|
@@ -177,7 +175,7 @@ def int_hash_64(col: Union[str, Column, Func, int]) -> Func:
|
|
|
177
175
|
)
|
|
178
176
|
|
|
179
177
|
|
|
180
|
-
def bit_hamming_distance(*args:
|
|
178
|
+
def bit_hamming_distance(*args: str | Column | Func | int) -> Func:
|
|
181
179
|
"""
|
|
182
180
|
Returns a function that computes the Hamming distance between two integers.
|
|
183
181
|
|
datachain/func/string.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import get_origin
|
|
2
2
|
|
|
3
3
|
from sqlalchemy import literal
|
|
4
4
|
|
|
@@ -44,7 +44,7 @@ def length(col: ColT) -> Func:
|
|
|
44
44
|
return Func("length", inner=string.length, cols=[col], result_type=int)
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def split(col: ColT, sep: str, limit:
|
|
47
|
+
def split(col: ColT, sep: str, limit: int | None = None) -> Func:
|
|
48
48
|
"""
|
|
49
49
|
Takes a column and split character and returns an array of the parts.
|
|
50
50
|
|
datachain/hash_utils.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import inspect
|
|
3
|
+
import textwrap
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from typing import TypeAlias, TypeVar
|
|
6
|
+
|
|
7
|
+
from sqlalchemy.sql.elements import ClauseElement, ColumnElement
|
|
8
|
+
|
|
9
|
+
from datachain import json
|
|
10
|
+
|
|
11
|
+
T = TypeVar("T", bound=ColumnElement)
|
|
12
|
+
ColumnLike: TypeAlias = str | T
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _serialize_value(val): # noqa: PLR0911
|
|
16
|
+
"""Helper to serialize arbitrary values recursively."""
|
|
17
|
+
if val is None:
|
|
18
|
+
return None
|
|
19
|
+
if isinstance(val, (str, int, float, bool)):
|
|
20
|
+
return val
|
|
21
|
+
if isinstance(val, ClauseElement):
|
|
22
|
+
return serialize_column_element(val)
|
|
23
|
+
if isinstance(val, dict):
|
|
24
|
+
# Sort dict keys for deterministic serialization
|
|
25
|
+
return {k: _serialize_value(v) for k, v in sorted(val.items())}
|
|
26
|
+
if isinstance(val, (list, tuple)):
|
|
27
|
+
return [_serialize_value(v) for v in val]
|
|
28
|
+
if callable(val):
|
|
29
|
+
return val.__name__ if hasattr(val, "__name__") else str(val)
|
|
30
|
+
return str(val)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def serialize_column_element(expr: str | ColumnElement) -> dict:
    """
    Turn a SQLAlchemy expression into a plain dict suitable for hashing.

    The result always has a "type" entry. Depending on the input it also has:
    - "value" for a ``BindParameter`` (only the bound value is used);
    - one entry per attribute listed in ``_traverse_internals`` (except
      "table") for a ``ClauseElement`` that exposes that attribute list;
    - "repr" (``str(expr)``) for a ``ClauseElement`` without
      ``_traverse_internals`` and for any non-``ClauseElement`` input,
      including plain strings.
    """
    from sqlalchemy.sql.elements import BindParameter

    # BindParameter carries a non-deterministic 'key'; keep just the value.
    if isinstance(expr, BindParameter):
        return {"type": "bind", "value": _serialize_value(expr.value)}

    # Completely unknown input: fall back to its string form.
    if not isinstance(expr, ClauseElement):
        return {"type": "other", "repr": str(expr)}

    type_name = expr.__class__.__name__

    # Custom ClauseElement without traversal metadata: structure is unknown,
    # so stringify it.
    if not hasattr(expr, "_traverse_internals"):
        return {"type": type_name, "repr": str(expr)}

    missing = object()
    payload = {"type": type_name}
    for attr_name, _ in expr._traverse_internals:
        # Table names can be auto-generated/random and are not semantically
        # important for hashing, so the 'table' attribute is left out.
        if attr_name == "table":
            continue
        attr_value = getattr(expr, attr_name, missing)
        if attr_value is not missing:
            payload[attr_name] = _serialize_value(attr_value)
    return payload
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def hash_column_elements(columns: ColumnLike | Sequence[ColumnLike]) -> str:
    """
    Return the SHA-256 hex digest of one or more column expressions.

    Each item is converted with ``serialize_column_element``; the resulting
    list is dumped to JSON with sorted keys and fixed separators, then hashed.
    A single ``ColumnElement`` or string is accepted and treated as a
    one-item sequence. Item order affects the hash, so pass an ordered
    iterable (list or tuple).
    """
    # A lone column (or column name) is wrapped so the loop below is uniform.
    items = (columns,) if isinstance(columns, (ColumnElement, str)) else columns

    payload = json.dumps(
        [serialize_column_element(item) for item in items],
        sort_keys=True,
        separators=(", ", ": "),
    )  # stable JSON
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _code_fingerprint(code) -> bytes:
    """Return bytes identifying *code*: raw bytecode plus the names/constants it uses."""

    def const_repr(const) -> str:
        # Nested code objects (inner lambdas, comprehensions) have a repr that
        # embeds a memory address, so fingerprint them recursively instead.
        if inspect.iscode(const):
            return _code_fingerprint(const).hex()
        # frozenset iteration order is not stable across runs; canonicalize it.
        if isinstance(const, frozenset):
            return "frozenset(" + ", ".join(sorted(map(const_repr, const))) + ")"
        if isinstance(const, tuple):
            return "(" + ", ".join(map(const_repr, const)) + ")"
        return repr(const)

    # co_code only stores *indices* into co_consts / co_names, so the operands
    # themselves must be part of the fingerprint.
    operands = (code.co_names, [const_repr(c) for c in code.co_consts])
    return code.co_code + repr(operands).encode()


def hash_callable(func):
    """
    Calculate a SHA-256 hex digest from a callable.

    Rules:
    - Named functions (def) and classes → use source code for stable,
      cross-version hashing
    - Lambdas, and named functions whose source is unavailable → use a
      bytecode fingerprint: the raw bytecode together with the constants and
      names it references (deterministic in same Python runtime)
    - Callable instances → hash their ``__call__`` method (instance state is
      not part of the hash)
    - Callables without Python code (e.g. builtins) → use ``module.qualname``

    The name, positional defaults and normalized annotations are mixed in to
    distinguish callables with the same code but different metadata.

    Raises:
        TypeError: if ``func`` is not callable, or if it exposes neither
            source, bytecode nor a qualified name to hash.
    """
    if not callable(func):
        raise TypeError("Expected a callable")

    # Callable instances carry their code on __call__, not on themselves.
    target = func
    if not hasattr(func, "__code__") and not inspect.isclass(func):
        call = getattr(func, "__call__", None)
        if hasattr(call, "__code__"):
            target = call

    name = getattr(target, "__name__", type(func).__name__)

    payload = None
    if name != "<lambda>":
        # Try to get exact source of the named callable
        try:
            lines, _ = inspect.getsourcelines(target)
        except (OSError, TypeError):
            pass  # source not available; fall through to bytecode
        else:
            payload = textwrap.dedent("".join(lines)).strip()

    if payload is None:
        code = getattr(target, "__code__", None)
        if code is not None:
            payload = _code_fingerprint(code)
        else:
            qualname = getattr(func, "__qualname__", None)
            if qualname is None:
                raise TypeError(
                    f"Cannot hash callable without inspectable code: {func!r}"
                )
            payload = f"{getattr(func, '__module__', None)}.{qualname}"

    # Normalize annotations
    annotations = {
        k: getattr(v, "__name__", str(v))
        for k, v in getattr(target, "__annotations__", {}).items()
    }

    # Extras to distinguish functions with same code but different metadata
    extras = {
        "name": name,
        "defaults": getattr(target, "__defaults__", None),
        "annotations": annotations,
    }

    # Compute SHA256
    h = hashlib.sha256()
    h.update(payload.encode() if isinstance(payload, str) else payload)
    h.update(str(extras).encode())
    return h.hexdigest()
|
datachain/job.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import json
|
|
2
1
|
import uuid
|
|
3
2
|
from dataclasses import dataclass
|
|
4
3
|
from datetime import datetime
|
|
5
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, TypeVar
|
|
5
|
+
|
|
6
|
+
from datachain import json
|
|
6
7
|
|
|
7
8
|
J = TypeVar("J", bound="Job")
|
|
8
9
|
|
|
@@ -18,27 +19,29 @@ class Job:
|
|
|
18
19
|
workers: int
|
|
19
20
|
params: dict[str, str]
|
|
20
21
|
metrics: dict[str, Any]
|
|
21
|
-
finished_at:
|
|
22
|
-
python_version:
|
|
22
|
+
finished_at: datetime | None = None
|
|
23
|
+
python_version: str | None = None
|
|
23
24
|
error_message: str = ""
|
|
24
25
|
error_stack: str = ""
|
|
26
|
+
parent_job_id: str | None = None
|
|
25
27
|
|
|
26
28
|
@classmethod
|
|
27
29
|
def parse(
|
|
28
30
|
cls,
|
|
29
|
-
id:
|
|
31
|
+
id: str | uuid.UUID,
|
|
30
32
|
name: str,
|
|
31
33
|
status: int,
|
|
32
34
|
created_at: datetime,
|
|
33
|
-
finished_at:
|
|
35
|
+
finished_at: datetime | None,
|
|
34
36
|
query: str,
|
|
35
37
|
query_type: int,
|
|
36
38
|
workers: int,
|
|
37
|
-
python_version:
|
|
39
|
+
python_version: str | None,
|
|
38
40
|
error_message: str,
|
|
39
41
|
error_stack: str,
|
|
40
42
|
params: str,
|
|
41
43
|
metrics: str,
|
|
44
|
+
parent_job_id: str | None,
|
|
42
45
|
) -> "Job":
|
|
43
46
|
return cls(
|
|
44
47
|
str(id),
|
|
@@ -54,4 +57,5 @@ class Job:
|
|
|
54
57
|
python_version,
|
|
55
58
|
error_message,
|
|
56
59
|
error_stack,
|
|
60
|
+
str(parent_job_id) if parent_job_id else None,
|
|
57
61
|
)
|