datachain 0.34.2__py3-none-any.whl → 0.34.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain has been flagged as potentially problematic. See the linked details for more information.

@@ -875,8 +875,17 @@ class SQLiteWarehouse(AbstractWarehouse):
875
875
  if isinstance(c, BinaryExpression):
876
876
  right_left_join = add_left_rows_filter(c)
877
877
 
878
- union = sqlalchemy.union(left_right_join, right_left_join).subquery()
879
- return sqlalchemy.select(*union.c).select_from(union)
878
+ # Use CTE instead of subquery to force SQLite to materialize the result
879
+ # This breaks deep nesting and prevents parser stack overflow.
880
+ union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
881
+
882
+ return self._regenerate_system_columns(union_cte)
883
+
884
+ def _system_row_number_expr(self):
885
+ return func.row_number().over()
886
+
887
+ def _system_random_expr(self):
888
+ return self._system_row_number_expr() * 1103515245 + 12345
880
889
 
881
890
  def create_pre_udf_table(self, query: "Select") -> "Table":
882
891
  """
@@ -246,6 +246,44 @@ class AbstractWarehouse(ABC, Serializable):
246
246
  break # no more results
247
247
  offset += page_size
248
248
 
249
+ def _regenerate_system_columns(self, selectable):
250
+ """Return a SELECT that regenerates sys__id and sys__rand deterministically."""
251
+
252
+ base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
253
+
254
+ system_types: dict[str, sa.types.TypeEngine] = {
255
+ sys_col.name: sys_col.type
256
+ for sys_col in self.schema.dataset_row_cls.sys_columns()
257
+ }
258
+
259
+ result_columns = []
260
+ for col in base.c:
261
+ if col.name == "sys__id":
262
+ expr = self._system_row_number_expr()
263
+ expr = sa.cast(expr, system_types["sys__id"])
264
+ result_columns.append(expr.label("sys__id"))
265
+ elif col.name == "sys__rand":
266
+ expr = self._system_random_expr()
267
+ expr = sa.cast(expr, system_types["sys__rand"])
268
+ result_columns.append(expr.label("sys__rand"))
269
+ else:
270
+ result_columns.append(col)
271
+
272
+ # Wrap in subquery to materialize window functions, then wrap again in SELECT
273
+ # This ensures window functions are computed before INSERT...FROM SELECT
274
+ inner = sa.select(*result_columns).select_from(base).subquery()
275
+ return sa.select(*inner.c).select_from(inner)
276
+
277
+ def _system_row_number_expr(self):
278
+ """Return an expression that produces deterministic row numbers."""
279
+
280
+ raise NotImplementedError
281
+
282
+ def _system_random_expr(self):
283
+ """Return an expression that produces deterministic random values."""
284
+
285
+ raise NotImplementedError
286
+
249
287
  #
250
288
  # Table Name Internal Functions
251
289
  #
@@ -923,6 +961,8 @@ class AbstractWarehouse(ABC, Serializable):
923
961
  right: "_FromClauseArgument",
924
962
  onclause: "_OnClauseArgument",
925
963
  inner: bool = True,
964
+ full: bool = False,
965
+ columns=None,
926
966
  ) -> sa.Select:
927
967
  """
928
968
  Join two tables together.
@@ -1701,7 +1701,11 @@ class DataChain:
1701
1701
  )
1702
1702
 
1703
1703
  query = self._query.join(
1704
- right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
1704
+ right_ds._query,
1705
+ sqlalchemy.and_(*ops),
1706
+ inner,
1707
+ full,
1708
+ rname + "{name}",
1705
1709
  )
1706
1710
  query.feature_schema = None
1707
1711
  ds = self._evolve(query=query)
@@ -1989,7 +1993,8 @@ class DataChain:
1989
1993
  results = self.results(include_hidden=include_hidden)
1990
1994
  if as_object:
1991
1995
  df = pd.DataFrame(results, columns=columns, dtype=object)
1992
- return df.where(pd.notna(df), None)
1996
+ df.where(pd.notna(df), None, inplace=True)
1997
+ return df
1993
1998
  return pd.DataFrame.from_records(results, columns=columns)
1994
1999
 
1995
2000
  def show(
@@ -1,6 +1,8 @@
1
1
  import copy
2
2
  import hashlib
3
3
  import json
4
+ import logging
5
+ import math
4
6
  import warnings
5
7
  from collections.abc import Iterator, Sequence
6
8
  from dataclasses import dataclass
@@ -23,7 +25,7 @@ from typing import ( # noqa: UP035
23
25
  get_origin,
24
26
  )
25
27
 
26
- from pydantic import BaseModel, Field, create_model
28
+ from pydantic import BaseModel, Field, ValidationError, create_model
27
29
  from sqlalchemy import ColumnElement
28
30
  from typing_extensions import Literal as LiteralEx
29
31
 
@@ -43,6 +45,8 @@ if TYPE_CHECKING:
43
45
  from datachain.catalog import Catalog
44
46
 
45
47
 
48
+ logger = logging.getLogger(__name__)
49
+
46
50
  NAMES_TO_TYPES = {
47
51
  "int": int,
48
52
  "str": str,
@@ -463,12 +467,32 @@ class SignalSchema:
463
467
  objs.append(self.setup_values.get(name))
464
468
  elif (fr := ModelStore.to_pydantic(fr_type)) is not None:
465
469
  j, pos = unflatten_to_json_pos(fr, row, pos)
466
- objs.append(fr(**j))
470
+ try:
471
+ obj = fr(**j)
472
+ except ValidationError as e:
473
+ if self._all_values_none(j):
474
+ logger.debug("Failed to create input for %s: %s", name, e)
475
+ obj = None
476
+ else:
477
+ raise
478
+ objs.append(obj)
467
479
  else:
468
480
  objs.append(row[pos])
469
481
  pos += 1
470
482
  return objs
471
483
 
484
+ @staticmethod
485
+ def _all_values_none(value: Any) -> bool:
486
+ if isinstance(value, dict):
487
+ return all(SignalSchema._all_values_none(v) for v in value.values())
488
+ if isinstance(value, (list, tuple, set)):
489
+ return all(SignalSchema._all_values_none(v) for v in value)
490
+ if isinstance(value, float):
491
+ # NaN is used to represent NULL and NaN float values in datachain
492
+ # Since SQLite does not have a separate NULL type, we need to check for NaN
493
+ return math.isnan(value) or value is None
494
+ return value is None
495
+
472
496
  def get_file_signal(self) -> Optional[str]:
473
497
  for signal_name, signal_type in self.values.items():
474
498
  if (fr := ModelStore.to_pydantic(signal_type)) is not None and issubclass(
@@ -546,8 +570,15 @@ class SignalSchema:
546
570
  pos += 1
547
571
  else:
548
572
  json, pos = unflatten_to_json_pos(fr, row, pos) # type: ignore[union-attr]
549
- obj = fr(**json)
550
- SignalSchema._set_file_stream(obj, catalog, cache)
573
+ try:
574
+ obj = fr(**json)
575
+ SignalSchema._set_file_stream(obj, catalog, cache)
576
+ except ValidationError as e:
577
+ if self._all_values_none(json):
578
+ logger.debug("Failed to create feature for %s: %s", fr_cls, e)
579
+ obj = None
580
+ else:
581
+ raise
551
582
  res.append(obj)
552
583
  return res
553
584
 
@@ -1009,7 +1009,9 @@ class SQLJoin(Step):
1009
1009
  rname: str
1010
1010
 
1011
1011
  def hash_inputs(self) -> str:
1012
- predicates = ensure_sequence(self.predicates or [])
1012
+ predicates = (
1013
+ ensure_sequence(self.predicates) if self.predicates is not None else []
1014
+ )
1013
1015
 
1014
1016
  parts = [
1015
1017
  bytes.fromhex(self.query1.hash()),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.2
3
+ Version: 0.34.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -56,8 +56,8 @@ datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE
56
56
  datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
57
57
  datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
58
58
  datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
59
- datachain/data_storage/sqlite.py,sha256=YNHXPdJeTEoWfhZYb5fsLf1CIjiEhB7VG4OgQzDrWVU,30936
60
- datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
59
+ datachain/data_storage/sqlite.py,sha256=aZY-1pjzvoRcPyIJ7i9QQu6kH8tCDkfFgPybHjFHg1k,31266
60
+ datachain/data_storage/warehouse.py,sha256=dPafzy-JsN2x9TD0j0ZBIUie6sE2Z8XzwELfOZ6quyU,34386
61
61
  datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
62
62
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -90,7 +90,7 @@ datachain/lib/namespaces.py,sha256=ZyIYUa3WMrv6R5HrSoLsmLiEbvUQDl8sBINLUmWOYG0,3
90
90
  datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
91
91
  datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
92
92
  datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
93
- datachain/lib/signal_schema.py,sha256=WDFLbzXEOhgv865TePcFpLQHxsKQHtn8kTzaQGUG_XA,39479
93
+ datachain/lib/signal_schema.py,sha256=NsL2ISnSRN-lKRpXzB9CtsUj2tVKcoAe73TaaZKMT-0,40774
94
94
  datachain/lib/tar.py,sha256=MLcVjzIgBqRuJacCNpZ6kwSZNq1i2tLyROc8PVprHsA,999
95
95
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
96
96
  datachain/lib/udf.py,sha256=DdUxGBo9Y7Jz6aTBKgwex7YfK1RNaGm1JUlXCqs7qnw,18122
@@ -108,7 +108,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
108
108
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
109
109
  datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
110
110
  datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
111
- datachain/lib/dc/datachain.py,sha256=Xh7Hwpvow_3QHPhsPSpP99HDKlwcJOpZEZJUNa_Ex9c,104396
111
+ datachain/lib/dc/datachain.py,sha256=TW9kcqNJr46_gQTpeCcSxYKKUpkk9cLVW9ADTcPJrug,104474
112
112
  datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
113
113
  datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
114
114
  datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -131,7 +131,7 @@ datachain/model/ultralytics/pose.py,sha256=pvoXrWWUSWT_UBaMwUb5MBHAY57Co2HFDPigF
131
131
  datachain/model/ultralytics/segment.py,sha256=v9_xDxd5zw_I8rXsbl7yQXgEdTs2T38zyY_Y4XGN8ok,3194
132
132
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
133
133
  datachain/query/batch.py,sha256=ocPeNgrJM6Y_6SYCx3O2cwlCFAhNMfoYgB99GP6A1Bg,4294
134
- datachain/query/dataset.py,sha256=P7pyRiWc9G3AfzxvyB2yToKW3bXoUCrfFOtFdiVbCrU,67836
134
+ datachain/query/dataset.py,sha256=RQLNc368vrKI6EdsugbXWFbJobl430yXV-Ks1i4sdfo,67893
135
135
  datachain/query/dispatch.py,sha256=pygp7xg3lUDKlYHhecKxW5fB3zOSX1fPJfZBU4dfijk,16067
136
136
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
137
137
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.34.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.34.2.dist-info/METADATA,sha256=p-mulDC4TJ2QOJr2peiHCygfiVP1bwwdubi-fyfLQkg,13655
170
- datachain-0.34.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.34.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.34.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.34.2.dist-info/RECORD,,
168
+ datachain-0.34.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.34.4.dist-info/METADATA,sha256=pjivvNYJPbaTLyOpWYRJiaaoyC8k-LUaDl-dczGFUQc,13655
170
+ datachain-0.34.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.34.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.34.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.34.4.dist-info/RECORD,,