datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,3 @@
1
- from typing import Optional, Union
2
-
3
1
  from sqlalchemy import ColumnElement
4
2
  from sqlalchemy import and_ as sql_and
5
3
  from sqlalchemy import case as sql_case
@@ -12,10 +10,10 @@ from datachain.sql.functions import conditional
12
10
 
13
11
  from .func import Func
14
12
 
15
- CaseT = Union[int, float, complex, bool, str, Func, ColumnElement]
13
+ CaseT = int | float | complex | bool | str | Func | ColumnElement
16
14
 
17
15
 
18
- def greatest(*args: Union[str, Column, Func, float]) -> Func:
16
+ def greatest(*args: str | Column | Func | float) -> Func:
19
17
  """
20
18
  Returns the greatest (largest) value from the given input values.
21
19
 
@@ -56,7 +54,7 @@ def greatest(*args: Union[str, Column, Func, float]) -> Func:
56
54
  )
57
55
 
58
56
 
59
- def least(*args: Union[str, Column, Func, float]) -> Func:
57
+ def least(*args: str | Column | Func | float) -> Func:
60
58
  """
61
59
  Returns the least (smallest) value from the given input values.
62
60
 
@@ -94,7 +92,7 @@ def least(*args: Union[str, Column, Func, float]) -> Func:
94
92
 
95
93
 
96
94
  def case(
97
- *args: tuple[Union[ColumnElement, Func, bool], CaseT], else_: Optional[CaseT] = None
95
+ *args: tuple[ColumnElement | Func | bool, CaseT], else_: CaseT | None = None
98
96
  ) -> Func:
99
97
  """
100
98
  Returns a case expression that evaluates a list of conditions and returns
@@ -163,9 +161,7 @@ def case(
163
161
  return Func("case", inner=sql_case, cols=args, kwargs=kwargs, result_type=type_)
164
162
 
165
163
 
166
- def ifelse(
167
- condition: Union[ColumnElement, Func], if_val: CaseT, else_val: CaseT
168
- ) -> Func:
164
+ def ifelse(condition: ColumnElement | Func, if_val: CaseT, else_val: CaseT) -> Func:
169
165
  """
170
166
  Returns an if-else expression that evaluates a condition and returns one
171
167
  of two values based on the result. Values can be Python primitives
@@ -193,7 +189,7 @@ def ifelse(
193
189
  return case((condition, if_val), else_=else_val)
194
190
 
195
191
 
196
- def isnone(col: Union[str, ColumnElement]) -> Func:
192
+ def isnone(col: str | ColumnElement) -> Func:
197
193
  """
198
194
  Returns a function that checks if the column value is `None` (NULL in DB).
199
195
 
@@ -221,7 +217,7 @@ def isnone(col: Union[str, ColumnElement]) -> Func:
221
217
  return case((col.is_(None) if col is not None else True, True), else_=False)
222
218
 
223
219
 
224
- def or_(*args: Union[ColumnElement, Func]) -> Func:
220
+ def or_(*args: ColumnElement | Func) -> Func:
225
221
  """
226
222
  Returns the function that produces conjunction of expressions joined by OR
227
223
  logical operator.
@@ -256,7 +252,7 @@ def or_(*args: Union[ColumnElement, Func]) -> Func:
256
252
  return Func("or", inner=sql_or, cols=cols, args=func_args, result_type=bool)
257
253
 
258
254
 
259
- def and_(*args: Union[ColumnElement, Func]) -> Func:
255
+ def and_(*args: ColumnElement | Func) -> Func:
260
256
  """
261
257
  Returns the function that produces conjunction of expressions joined by AND
262
258
  logical operator.
@@ -291,7 +287,7 @@ def and_(*args: Union[ColumnElement, Func]) -> Func:
291
287
  return Func("and", inner=sql_and, cols=cols, args=func_args, result_type=bool)
292
288
 
293
289
 
294
- def not_(arg: Union[ColumnElement, Func]) -> Func:
290
+ def not_(arg: ColumnElement | Func) -> Func:
295
291
  """
296
292
  Returns the function that produces NOT of the given expressions.
297
293
 
datachain/func/func.py CHANGED
@@ -1,12 +1,13 @@
1
1
  import inspect
2
- from collections.abc import Sequence
3
- from typing import TYPE_CHECKING, Any, Callable, Optional, Union, get_args, get_origin
2
+ from collections.abc import Callable, Sequence
3
+ from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
4
4
 
5
5
  from sqlalchemy import BindParameter, Case, ColumnElement, Integer, cast, desc
6
6
  from sqlalchemy.sql import func as sa_func
7
7
 
8
8
  from datachain.lib.convert.python_to_sql import python_to_sql
9
9
  from datachain.lib.convert.sql_to_python import sql_to_python
10
+ from datachain.lib.model_store import ModelStore
10
11
  from datachain.lib.utils import DataChainColumnError, DataChainParamsError
11
12
  from datachain.query.schema import Column, ColumnMeta
12
13
  from datachain.sql.functions import numeric
@@ -22,26 +23,29 @@ if TYPE_CHECKING:
22
23
  from .window import Window
23
24
 
24
25
 
25
- ColT = Union[str, Column, ColumnElement, "Func", tuple]
26
+ ColT = Union[str, tuple, Column, ColumnElement, "Func"]
26
27
 
27
28
 
28
29
  class Func(Function): # noqa: PLW1641
29
30
  """Represents a function to be applied to a column in a SQL query."""
30
31
 
32
+ cols: Sequence[ColT]
33
+ args: Sequence[Any]
34
+
31
35
  def __init__(
32
36
  self,
33
37
  name: str,
34
38
  inner: Callable,
35
- cols: Optional[Sequence[ColT]] = None,
36
- args: Optional[Sequence[Any]] = None,
37
- kwargs: Optional[dict[str, Any]] = None,
38
- result_type: Optional["DataType"] = None,
39
- type_from_args: Optional[Callable[..., "DataType"]] = None,
39
+ cols: Sequence[ColT] | None = None,
40
+ args: Sequence[Any] | None = None,
41
+ kwargs: dict[str, Any] | None = None,
42
+ result_type: "DataType | None" = None,
43
+ type_from_args: Callable[..., "DataType"] | None = None,
40
44
  is_array: bool = False,
41
45
  from_array: bool = False,
42
46
  is_window: bool = False,
43
- window: Optional["Window"] = None,
44
- label: Optional[str] = None,
47
+ window: "Window | None" = None,
48
+ label: str | None = None,
45
49
  ) -> None:
46
50
  self.name = name
47
51
  self.inner = inner
@@ -95,7 +99,7 @@ class Func(Function): # noqa: PLW1641
95
99
  else []
96
100
  )
97
101
 
98
- def _db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
102
+ def _db_col_type(self, signals_schema: "SignalSchema") -> "DataType | None":
99
103
  if not self._db_cols:
100
104
  return None
101
105
 
@@ -125,51 +129,51 @@ class Func(Function): # noqa: PLW1641
125
129
 
126
130
  return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
127
131
 
128
- def __add__(self, other: Union[ColT, float]) -> "Func":
132
+ def __add__(self, other: ColT | float) -> "Func":
129
133
  if isinstance(other, (int, float)):
130
134
  return Func("add", lambda a: a + other, [self])
131
135
  return Func("add", lambda a1, a2: a1 + a2, [self, other])
132
136
 
133
- def __radd__(self, other: Union[ColT, float]) -> "Func":
137
+ def __radd__(self, other: ColT | float) -> "Func":
134
138
  if isinstance(other, (int, float)):
135
139
  return Func("add", lambda a: other + a, [self])
136
140
  return Func("add", lambda a1, a2: a1 + a2, [other, self])
137
141
 
138
- def __sub__(self, other: Union[ColT, float]) -> "Func":
142
+ def __sub__(self, other: ColT | float) -> "Func":
139
143
  if isinstance(other, (int, float)):
140
144
  return Func("sub", lambda a: a - other, [self])
141
145
  return Func("sub", lambda a1, a2: a1 - a2, [self, other])
142
146
 
143
- def __rsub__(self, other: Union[ColT, float]) -> "Func":
147
+ def __rsub__(self, other: ColT | float) -> "Func":
144
148
  if isinstance(other, (int, float)):
145
149
  return Func("sub", lambda a: other - a, [self])
146
150
  return Func("sub", lambda a1, a2: a1 - a2, [other, self])
147
151
 
148
- def __mul__(self, other: Union[ColT, float]) -> "Func":
152
+ def __mul__(self, other: ColT | float) -> "Func":
149
153
  if isinstance(other, (int, float)):
150
154
  return Func("mul", lambda a: a * other, [self])
151
155
  return Func("mul", lambda a1, a2: a1 * a2, [self, other])
152
156
 
153
- def __rmul__(self, other: Union[ColT, float]) -> "Func":
157
+ def __rmul__(self, other: ColT | float) -> "Func":
154
158
  if isinstance(other, (int, float)):
155
159
  return Func("mul", lambda a: other * a, [self])
156
160
  return Func("mul", lambda a1, a2: a1 * a2, [other, self])
157
161
 
158
- def __truediv__(self, other: Union[ColT, float]) -> "Func":
162
+ def __truediv__(self, other: ColT | float) -> "Func":
159
163
  if isinstance(other, (int, float)):
160
164
  return Func("div", lambda a: _truediv(a, other), [self], result_type=float)
161
165
  return Func(
162
166
  "div", lambda a1, a2: _truediv(a1, a2), [self, other], result_type=float
163
167
  )
164
168
 
165
- def __rtruediv__(self, other: Union[ColT, float]) -> "Func":
169
+ def __rtruediv__(self, other: ColT | float) -> "Func":
166
170
  if isinstance(other, (int, float)):
167
171
  return Func("div", lambda a: _truediv(other, a), [self], result_type=float)
168
172
  return Func(
169
173
  "div", lambda a1, a2: _truediv(a1, a2), [other, self], result_type=float
170
174
  )
171
175
 
172
- def __floordiv__(self, other: Union[ColT, float]) -> "Func":
176
+ def __floordiv__(self, other: ColT | float) -> "Func":
173
177
  if isinstance(other, (int, float)):
174
178
  return Func(
175
179
  "floordiv", lambda a: _floordiv(a, other), [self], result_type=int
@@ -178,7 +182,7 @@ class Func(Function): # noqa: PLW1641
178
182
  "floordiv", lambda a1, a2: _floordiv(a1, a2), [self, other], result_type=int
179
183
  )
180
184
 
181
- def __rfloordiv__(self, other: Union[ColT, float]) -> "Func":
185
+ def __rfloordiv__(self, other: ColT | float) -> "Func":
182
186
  if isinstance(other, (int, float)):
183
187
  return Func(
184
188
  "floordiv", lambda a: _floordiv(other, a), [self], result_type=int
@@ -187,17 +191,17 @@ class Func(Function): # noqa: PLW1641
187
191
  "floordiv", lambda a1, a2: _floordiv(a1, a2), [other, self], result_type=int
188
192
  )
189
193
 
190
- def __mod__(self, other: Union[ColT, float]) -> "Func":
194
+ def __mod__(self, other: ColT | float) -> "Func":
191
195
  if isinstance(other, (int, float)):
192
196
  return Func("mod", lambda a: a % other, [self], result_type=int)
193
197
  return Func("mod", lambda a1, a2: a1 % a2, [self, other], result_type=int)
194
198
 
195
- def __rmod__(self, other: Union[ColT, float]) -> "Func":
199
+ def __rmod__(self, other: ColT | float) -> "Func":
196
200
  if isinstance(other, (int, float)):
197
201
  return Func("mod", lambda a: other % a, [self], result_type=int)
198
202
  return Func("mod", lambda a1, a2: a1 % a2, [other, self], result_type=int)
199
203
 
200
- def __and__(self, other: Union[ColT, float]) -> "Func":
204
+ def __and__(self, other: ColT | float) -> "Func":
201
205
  if isinstance(other, (int, float)):
202
206
  return Func(
203
207
  "and", lambda a: numeric.bit_and(a, other), [self], result_type=int
@@ -209,7 +213,7 @@ class Func(Function): # noqa: PLW1641
209
213
  result_type=int,
210
214
  )
211
215
 
212
- def __rand__(self, other: Union[ColT, float]) -> "Func":
216
+ def __rand__(self, other: ColT | float) -> "Func":
213
217
  if isinstance(other, (int, float)):
214
218
  return Func(
215
219
  "and", lambda a: numeric.bit_and(other, a), [self], result_type=int
@@ -221,7 +225,7 @@ class Func(Function): # noqa: PLW1641
221
225
  result_type=int,
222
226
  )
223
227
 
224
- def __or__(self, other: Union[ColT, float]) -> "Func":
228
+ def __or__(self, other: ColT | float) -> "Func":
225
229
  if isinstance(other, (int, float)):
226
230
  return Func(
227
231
  "or", lambda a: numeric.bit_or(a, other), [self], result_type=int
@@ -230,7 +234,7 @@ class Func(Function): # noqa: PLW1641
230
234
  "or", lambda a1, a2: numeric.bit_or(a1, a2), [self, other], result_type=int
231
235
  )
232
236
 
233
- def __ror__(self, other: Union[ColT, float]) -> "Func":
237
+ def __ror__(self, other: ColT | float) -> "Func":
234
238
  if isinstance(other, (int, float)):
235
239
  return Func(
236
240
  "or", lambda a: numeric.bit_or(other, a), [self], result_type=int
@@ -239,7 +243,7 @@ class Func(Function): # noqa: PLW1641
239
243
  "or", lambda a1, a2: numeric.bit_or(a1, a2), [other, self], result_type=int
240
244
  )
241
245
 
242
- def __xor__(self, other: Union[ColT, float]) -> "Func":
246
+ def __xor__(self, other: ColT | float) -> "Func":
243
247
  if isinstance(other, (int, float)):
244
248
  return Func(
245
249
  "xor", lambda a: numeric.bit_xor(a, other), [self], result_type=int
@@ -251,7 +255,7 @@ class Func(Function): # noqa: PLW1641
251
255
  result_type=int,
252
256
  )
253
257
 
254
- def __rxor__(self, other: Union[ColT, float]) -> "Func":
258
+ def __rxor__(self, other: ColT | float) -> "Func":
255
259
  if isinstance(other, (int, float)):
256
260
  return Func(
257
261
  "xor", lambda a: numeric.bit_xor(other, a), [self], result_type=int
@@ -263,7 +267,7 @@ class Func(Function): # noqa: PLW1641
263
267
  result_type=int,
264
268
  )
265
269
 
266
- def __rshift__(self, other: Union[ColT, float]) -> "Func":
270
+ def __rshift__(self, other: ColT | float) -> "Func":
267
271
  if isinstance(other, (int, float)):
268
272
  return Func(
269
273
  "rshift",
@@ -278,7 +282,7 @@ class Func(Function): # noqa: PLW1641
278
282
  result_type=int,
279
283
  )
280
284
 
281
- def __rrshift__(self, other: Union[ColT, float]) -> "Func":
285
+ def __rrshift__(self, other: ColT | float) -> "Func":
282
286
  if isinstance(other, (int, float)):
283
287
  return Func(
284
288
  "rshift",
@@ -293,7 +297,7 @@ class Func(Function): # noqa: PLW1641
293
297
  result_type=int,
294
298
  )
295
299
 
296
- def __lshift__(self, other: Union[ColT, float]) -> "Func":
300
+ def __lshift__(self, other: ColT | float) -> "Func":
297
301
  if isinstance(other, (int, float)):
298
302
  return Func(
299
303
  "lshift",
@@ -308,7 +312,7 @@ class Func(Function): # noqa: PLW1641
308
312
  result_type=int,
309
313
  )
310
314
 
311
- def __rlshift__(self, other: Union[ColT, float]) -> "Func":
315
+ def __rlshift__(self, other: ColT | float) -> "Func":
312
316
  if isinstance(other, (int, float)):
313
317
  return Func(
314
318
  "lshift",
@@ -323,12 +327,12 @@ class Func(Function): # noqa: PLW1641
323
327
  result_type=int,
324
328
  )
325
329
 
326
- def __lt__(self, other: Union[ColT, float]) -> "Func":
330
+ def __lt__(self, other: ColT | float) -> "Func":
327
331
  if isinstance(other, (int, float)):
328
332
  return Func("lt", lambda a: a < other, [self], result_type=bool)
329
333
  return Func("lt", lambda a1, a2: a1 < a2, [self, other], result_type=bool)
330
334
 
331
- def __le__(self, other: Union[ColT, float]) -> "Func":
335
+ def __le__(self, other: ColT | float) -> "Func":
332
336
  if isinstance(other, (int, float)):
333
337
  return Func("le", lambda a: a <= other, [self], result_type=bool)
334
338
  return Func("le", lambda a1, a2: a1 <= a2, [self, other], result_type=bool)
@@ -343,12 +347,12 @@ class Func(Function): # noqa: PLW1641
343
347
  return Func("ne", lambda a: a != other, [self], result_type=bool)
344
348
  return Func("ne", lambda a1, a2: a1 != a2, [self, other], result_type=bool)
345
349
 
346
- def __gt__(self, other: Union[ColT, float]) -> "Func":
350
+ def __gt__(self, other: ColT | float) -> "Func":
347
351
  if isinstance(other, (int, float)):
348
352
  return Func("gt", lambda a: a > other, [self], result_type=bool)
349
353
  return Func("gt", lambda a1, a2: a1 > a2, [self, other], result_type=bool)
350
354
 
351
- def __ge__(self, other: Union[ColT, float]) -> "Func":
355
+ def __ge__(self, other: ColT | float) -> "Func":
352
356
  if isinstance(other, (int, float)):
353
357
  return Func("ge", lambda a: a >= other, [self], result_type=bool)
354
358
  return Func("ge", lambda a1, a2: a1 >= a2, [self, other], result_type=bool)
@@ -369,7 +373,7 @@ class Func(Function): # noqa: PLW1641
369
373
  label,
370
374
  )
371
375
 
372
- def get_col_name(self, label: Optional[str] = None) -> str:
376
+ def get_col_name(self, label: str | None = None) -> str:
373
377
  if label:
374
378
  return label
375
379
  if self.col_label:
@@ -384,7 +388,7 @@ class Func(Function): # noqa: PLW1641
384
388
  return self.name
385
389
 
386
390
  def get_result_type(
387
- self, signals_schema: Optional["SignalSchema"] = None
391
+ self, signals_schema: "SignalSchema | None" = None
388
392
  ) -> "DataType":
389
393
  if self.result_type:
390
394
  return self.result_type
@@ -408,10 +412,24 @@ class Func(Function): # noqa: PLW1641
408
412
 
409
413
  def get_column(
410
414
  self,
411
- signals_schema: Optional["SignalSchema"] = None,
412
- label: Optional[str] = None,
413
- table: Optional["TableClause"] = None,
415
+ signals_schema: "SignalSchema | None" = None,
416
+ label: str | None = None,
417
+ table: "TableClause | None" = None,
414
418
  ) -> Column:
419
+ # Guard against using complex (pydantic) object columns in SQL funcs
420
+ if signals_schema and self._db_cols:
421
+ for arg in self._db_cols:
422
+ # _db_cols normalizes known columns to strings; skip non-string args
423
+ if not isinstance(arg, str):
424
+ continue
425
+ t_with_sub = signals_schema.get_column_type(arg, with_subtree=True)
426
+ if ModelStore.is_pydantic(t_with_sub):
427
+ raise DataChainParamsError(
428
+ f"Function {self.name} doesn't support complex object "
429
+ f"columns like '{arg}'. Use a leaf field (e.g., "
430
+ f"'{arg}.path') or use UDFs to operate on complex objects."
431
+ )
432
+
415
433
  col_type = self.get_result_type(signals_schema)
416
434
  sql_type = python_to_sql(col_type)
417
435
 
@@ -431,6 +449,7 @@ class Func(Function): # noqa: PLW1641
431
449
  return col
432
450
 
433
451
  cols = [get_col(col) for col in self._db_cols]
452
+
434
453
  kwargs = {k: get_col(v, string_as_literal=True) for k, v in self.kwargs.items()}
435
454
  func_col = self.inner(*cols, *self.args, **kwargs)
436
455
 
@@ -467,9 +486,8 @@ def get_db_col_type(signals_schema: "SignalSchema", col: ColT) -> "DataType":
467
486
  if isinstance(col, ColumnElement) and not hasattr(col, "name"):
468
487
  return sql_to_python(col)
469
488
 
470
- return signals_schema.get_column_type(
471
- col.name if isinstance(col, ColumnElement) else col # type: ignore[arg-type]
472
- )
489
+ name = col.name if isinstance(col, ColumnElement) else col # type: ignore[assignment]
490
+ return signals_schema.get_column_type(name) # type: ignore[arg-type]
473
491
 
474
492
 
475
493
  def _truediv(a, b):
datachain/func/numeric.py CHANGED
@@ -1,12 +1,10 @@
1
- from typing import Union
2
-
3
1
  from datachain.query.schema import Column
4
2
  from datachain.sql.functions import numeric
5
3
 
6
4
  from .func import Func
7
5
 
8
6
 
9
- def bit_and(*args: Union[str, Column, Func, int]) -> Func:
7
+ def bit_and(*args: str | Column | Func | int) -> Func:
10
8
  """
11
9
  Returns a function that computes the bitwise AND operation between two values.
12
10
 
@@ -51,7 +49,7 @@ def bit_and(*args: Union[str, Column, Func, int]) -> Func:
51
49
  )
52
50
 
53
51
 
54
- def bit_or(*args: Union[str, Column, Func, int]) -> Func:
52
+ def bit_or(*args: str | Column | Func | int) -> Func:
55
53
  """
56
54
  Returns a function that computes the bitwise OR operation between two values.
57
55
 
@@ -96,7 +94,7 @@ def bit_or(*args: Union[str, Column, Func, int]) -> Func:
96
94
  )
97
95
 
98
96
 
99
- def bit_xor(*args: Union[str, Column, Func, int]) -> Func:
97
+ def bit_xor(*args: str | Column | Func | int) -> Func:
100
98
  """
101
99
  Returns a function that computes the bitwise XOR operation between two values.
102
100
 
@@ -141,7 +139,7 @@ def bit_xor(*args: Union[str, Column, Func, int]) -> Func:
141
139
  )
142
140
 
143
141
 
144
- def int_hash_64(col: Union[str, Column, Func, int]) -> Func:
142
+ def int_hash_64(col: str | Column | Func | int) -> Func:
145
143
  """
146
144
  Returns a function that computes the 64-bit hash of an integer.
147
145
 
@@ -177,7 +175,7 @@ def int_hash_64(col: Union[str, Column, Func, int]) -> Func:
177
175
  )
178
176
 
179
177
 
180
- def bit_hamming_distance(*args: Union[str, Column, Func, int]) -> Func:
178
+ def bit_hamming_distance(*args: str | Column | Func | int) -> Func:
181
179
  """
182
180
  Returns a function that computes the Hamming distance between two integers.
183
181
 
datachain/func/string.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Optional, get_origin
1
+ from typing import get_origin
2
2
 
3
3
  from sqlalchemy import literal
4
4
 
@@ -44,7 +44,7 @@ def length(col: ColT) -> Func:
44
44
  return Func("length", inner=string.length, cols=[col], result_type=int)
45
45
 
46
46
 
47
- def split(col: ColT, sep: str, limit: Optional[int] = None) -> Func:
47
+ def split(col: ColT, sep: str, limit: int | None = None) -> Func:
48
48
  """
49
49
  Takes a column and split character and returns an array of the parts.
50
50
 
@@ -0,0 +1,123 @@
1
+ import hashlib
2
+ import inspect
3
+ import textwrap
4
+ from collections.abc import Sequence
5
+ from typing import TypeAlias, TypeVar
6
+
7
+ from sqlalchemy.sql.elements import ClauseElement, ColumnElement
8
+
9
+ from datachain import json
10
+
11
+ T = TypeVar("T", bound=ColumnElement)
12
+ ColumnLike: TypeAlias = str | T
13
+
14
+
15
+ def _serialize_value(val): # noqa: PLR0911
16
+ """Helper to serialize arbitrary values recursively."""
17
+ if val is None:
18
+ return None
19
+ if isinstance(val, (str, int, float, bool)):
20
+ return val
21
+ if isinstance(val, ClauseElement):
22
+ return serialize_column_element(val)
23
+ if isinstance(val, dict):
24
+ # Sort dict keys for deterministic serialization
25
+ return {k: _serialize_value(v) for k, v in sorted(val.items())}
26
+ if isinstance(val, (list, tuple)):
27
+ return [_serialize_value(v) for v in val]
28
+ if callable(val):
29
+ return val.__name__ if hasattr(val, "__name__") else str(val)
30
+ return str(val)
31
+
32
+
33
def serialize_column_element(expr: str | ColumnElement) -> dict:
    """
    Turn a SQLAlchemy expression into a deterministic, nested dict.

    Bind parameters are reduced to their value, other clause elements are
    walked through the attribute names listed in ``_traverse_internals``,
    and anything that is not a clause element is stringified.
    """
    from sqlalchemy.sql.elements import BindParameter

    # A BindParameter's 'key' is not deterministic, so only its value is kept.
    if isinstance(expr, BindParameter):
        return {"type": "bind", "value": _serialize_value(expr.value)}

    # Last-resort fallback for values that are not clause elements at all.
    if not isinstance(expr, ClauseElement):
        return {"type": "other", "repr": str(expr)}

    class_name = expr.__class__.__name__

    # A clause element without _traverse_internals has an unknown structure,
    # so its string form is the only thing that can be recorded.
    if not hasattr(expr, "_traverse_internals"):
        return {"type": class_name, "repr": str(expr)}

    # NOTE(review): if an internal attribute is itself named "type", the
    # assignment below replaces the class-name entry - confirm that is
    # intended.
    serialized = {"type": class_name}
    for attr_name, _ in expr._traverse_internals:
        # Table names can be auto-generated/random and carry no meaning for
        # hashing, so the 'table' attribute is left out.
        if attr_name == "table":
            continue
        if not hasattr(expr, attr_name):
            continue
        attr_value = getattr(expr, attr_name)
        serialized[attr_name] = _serialize_value(attr_value)
    return serialized
64
+
65
+
66
def hash_column_elements(columns: ColumnLike | Sequence[ColumnLike]) -> str:
    """
    Return the SHA-256 hex digest of one or more column expressions.

    Each column is serialized with ``serialize_column_element`` and the
    resulting list is dumped as JSON with sorted keys before hashing.
    The order of ``columns`` is part of the hash, so pass an ordered
    iterable (list or tuple). A single ColumnElement or string is treated
    as a one-element sequence.
    """
    if isinstance(columns, (ColumnElement, str)):
        # A lone column (or column name) was passed instead of a sequence.
        columns = (columns,)

    # Sorted keys and fixed separators keep the JSON text stable.
    payload = json.dumps(
        list(map(serialize_column_element, columns)),
        sort_keys=True,
        separators=(", ", ": "),
    )
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
80
+
81
+
82
def _code_fingerprint(code) -> bytes:
    """
    Return bytes describing a code object: bytecode, constants and names.

    ``co_code`` alone only stores *indexes* into ``co_consts`` / ``co_names``,
    so two functions that differ only in a literal or in the global they
    call share the same ``co_code``. Including both tuples removes that
    collision.
    """

    def describe(const):
        # Nested code objects (inner lambdas, comprehensions) have a repr
        # containing a memory address, so fingerprint them recursively.
        if hasattr(const, "co_code"):
            return _code_fingerprint(const).hex()
        # Set iteration order is not stable across runs; sort the members.
        if isinstance(const, frozenset):
            return "frozenset({" + ", ".join(sorted(map(repr, const))) + "})"
        return repr(const)

    consts = [describe(const) for const in code.co_consts]
    return code.co_code + repr((consts, code.co_names)).encode()


def hash_callable(func):
    """
    Calculate a SHA-256 hex digest identifying a callable.

    Rules:
    - Named functions (def) -> source code, when it can be retrieved.
    - Lambdas, and functions whose source is unavailable -> bytecode
      together with the constants and names it refers to.
    - ``functools.partial`` -> hash of the wrapped callable combined with
      the repr of the bound positional and keyword arguments.
    - Objects whose class defines ``__call__`` in Python -> hash of that
      ``__call__`` function (instance state is not included).
    - Other callables (builtins, C-implemented types) -> module and
      qualified name, the only stable information available.

    Raises:
        TypeError: if ``func`` is not callable.
    """
    import functools  # local import keeps this block self-contained

    if not callable(func):
        raise TypeError("Expected a callable")

    if isinstance(func, functools.partial):
        digest = hashlib.sha256()
        digest.update(hash_callable(func.func).encode())
        bound = (func.args, sorted(func.keywords.items()))
        digest.update(repr(bound).encode())
        return digest.hexdigest()

    code = getattr(func, "__code__", None)
    if code is None:
        # Not a plain function or method: look for a Python-level __call__.
        call_impl = getattr(type(func), "__call__", None)
        if getattr(call_impl, "__code__", None) is not None:
            return hash_callable(call_impl)
        module = getattr(func, "__module__", None)
        qualname = getattr(func, "__qualname__", type(func).__qualname__)
        return hashlib.sha256(f"{module}.{qualname}".encode()).hexdigest()

    if func.__name__ == "<lambda>":
        # Lambdas are hashed from bytecode rather than source text.
        payload = _code_fingerprint(code)
    else:
        try:
            lines, _ = inspect.getsourcelines(func)
            payload = textwrap.dedent("".join(lines)).strip()
        except (OSError, TypeError):
            # Source not available (e.g. defined interactively).
            payload = _code_fingerprint(code)

    # Normalize annotations to names so the repr below is stable.
    annotations = {
        k: getattr(v, "__name__", str(v)) for k, v in func.__annotations__.items()
    }

    # Extras distinguish functions with the same code but different metadata.
    extras = {
        "name": func.__name__,
        "defaults": func.__defaults__,
        "annotations": annotations,
    }

    h = hashlib.sha256()
    h.update(payload.encode() if isinstance(payload, str) else payload)
    h.update(str(extras).encode())
    return h.hexdigest()
datachain/job.py CHANGED
@@ -1,8 +1,9 @@
1
- import json
2
1
  import uuid
3
2
  from dataclasses import dataclass
4
3
  from datetime import datetime
5
- from typing import Any, Optional, TypeVar, Union
4
+ from typing import Any, TypeVar
5
+
6
+ from datachain import json
6
7
 
7
8
  J = TypeVar("J", bound="Job")
8
9
 
@@ -18,27 +19,29 @@ class Job:
18
19
  workers: int
19
20
  params: dict[str, str]
20
21
  metrics: dict[str, Any]
21
- finished_at: Optional[datetime] = None
22
- python_version: Optional[str] = None
22
+ finished_at: datetime | None = None
23
+ python_version: str | None = None
23
24
  error_message: str = ""
24
25
  error_stack: str = ""
26
+ parent_job_id: str | None = None
25
27
 
26
28
  @classmethod
27
29
  def parse(
28
30
  cls,
29
- id: Union[str, uuid.UUID],
31
+ id: str | uuid.UUID,
30
32
  name: str,
31
33
  status: int,
32
34
  created_at: datetime,
33
- finished_at: Optional[datetime],
35
+ finished_at: datetime | None,
34
36
  query: str,
35
37
  query_type: int,
36
38
  workers: int,
37
- python_version: Optional[str],
39
+ python_version: str | None,
38
40
  error_message: str,
39
41
  error_stack: str,
40
42
  params: str,
41
43
  metrics: str,
44
+ parent_job_id: str | None,
42
45
  ) -> "Job":
43
46
  return cls(
44
47
  str(id),
@@ -54,4 +57,5 @@ class Job:
54
57
  python_version,
55
58
  error_message,
56
59
  error_stack,
60
+ str(parent_job_id) if parent_job_id else None,
57
61
  )