datachain-0.34.2-py3-none-any.whl → datachain-0.34.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -875,8 +875,17 @@ class SQLiteWarehouse(AbstractWarehouse):
875
875
  if isinstance(c, BinaryExpression):
876
876
  right_left_join = add_left_rows_filter(c)
877
877
 
878
- union = sqlalchemy.union(left_right_join, right_left_join).subquery()
879
- return sqlalchemy.select(*union.c).select_from(union)
878
+ # Use CTE instead of subquery to force SQLite to materialize the result
879
+ # This breaks deep nesting and prevents parser stack overflow.
880
+ union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
881
+
882
+ return self._regenerate_system_columns(union_cte)
883
+
884
+ def _system_row_number_expr(self):
885
+ return func.row_number().over()
886
+
887
+ def _system_random_expr(self):
888
+ return self._system_row_number_expr() * 1103515245 + 12345
880
889
 
881
890
  def create_pre_udf_table(self, query: "Select") -> "Table":
882
891
  """
@@ -246,6 +246,44 @@ class AbstractWarehouse(ABC, Serializable):
246
246
  break # no more results
247
247
  offset += page_size
248
248
 
249
+ def _regenerate_system_columns(self, selectable):
250
+ """Return a SELECT that regenerates sys__id and sys__rand deterministically."""
251
+
252
+ base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
253
+
254
+ system_types: dict[str, sa.types.TypeEngine] = {
255
+ sys_col.name: sys_col.type
256
+ for sys_col in self.schema.dataset_row_cls.sys_columns()
257
+ }
258
+
259
+ result_columns = []
260
+ for col in base.c:
261
+ if col.name == "sys__id":
262
+ expr = self._system_row_number_expr()
263
+ expr = sa.cast(expr, system_types["sys__id"])
264
+ result_columns.append(expr.label("sys__id"))
265
+ elif col.name == "sys__rand":
266
+ expr = self._system_random_expr()
267
+ expr = sa.cast(expr, system_types["sys__rand"])
268
+ result_columns.append(expr.label("sys__rand"))
269
+ else:
270
+ result_columns.append(col)
271
+
272
+ # Wrap in subquery to materialize window functions, then wrap again in SELECT
273
+ # This ensures window functions are computed before INSERT...FROM SELECT
274
+ inner = sa.select(*result_columns).select_from(base).subquery()
275
+ return sa.select(*inner.c).select_from(inner)
276
+
277
+ def _system_row_number_expr(self):
278
+ """Return an expression that produces deterministic row numbers."""
279
+
280
+ raise NotImplementedError
281
+
282
+ def _system_random_expr(self):
283
+ """Return an expression that produces deterministic random values."""
284
+
285
+ raise NotImplementedError
286
+
249
287
  #
250
288
  # Table Name Internal Functions
251
289
  #
@@ -923,6 +961,8 @@ class AbstractWarehouse(ABC, Serializable):
923
961
  right: "_FromClauseArgument",
924
962
  onclause: "_OnClauseArgument",
925
963
  inner: bool = True,
964
+ full: bool = False,
965
+ columns=None,
926
966
  ) -> sa.Select:
927
967
  """
928
968
  Join two tables together.
@@ -1701,7 +1701,11 @@ class DataChain:
1701
1701
  )
1702
1702
 
1703
1703
  query = self._query.join(
1704
- right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
1704
+ right_ds._query,
1705
+ sqlalchemy.and_(*ops),
1706
+ inner,
1707
+ full,
1708
+ rname + "{name}",
1705
1709
  )
1706
1710
  query.feature_schema = None
1707
1711
  ds = self._evolve(query=query)
@@ -1989,7 +1993,8 @@ class DataChain:
1989
1993
  results = self.results(include_hidden=include_hidden)
1990
1994
  if as_object:
1991
1995
  df = pd.DataFrame(results, columns=columns, dtype=object)
1992
- return df.where(pd.notna(df), None)
1996
+ df.where(pd.notna(df), None, inplace=True)
1997
+ return df
1993
1998
  return pd.DataFrame.from_records(results, columns=columns)
1994
1999
 
1995
2000
  def show(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.2
3
+ Version: 0.34.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -56,8 +56,8 @@ datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE
56
56
  datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
57
57
  datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
58
58
  datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
59
- datachain/data_storage/sqlite.py,sha256=YNHXPdJeTEoWfhZYb5fsLf1CIjiEhB7VG4OgQzDrWVU,30936
60
- datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
59
+ datachain/data_storage/sqlite.py,sha256=aZY-1pjzvoRcPyIJ7i9QQu6kH8tCDkfFgPybHjFHg1k,31266
60
+ datachain/data_storage/warehouse.py,sha256=dPafzy-JsN2x9TD0j0ZBIUie6sE2Z8XzwELfOZ6quyU,34386
61
61
  datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
62
62
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -108,7 +108,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
108
108
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
109
109
  datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
110
110
  datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
111
- datachain/lib/dc/datachain.py,sha256=Xh7Hwpvow_3QHPhsPSpP99HDKlwcJOpZEZJUNa_Ex9c,104396
111
+ datachain/lib/dc/datachain.py,sha256=TW9kcqNJr46_gQTpeCcSxYKKUpkk9cLVW9ADTcPJrug,104474
112
112
  datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
113
113
  datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
114
114
  datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -165,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
165
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
166
166
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
167
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
168
- datachain-0.34.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
- datachain-0.34.2.dist-info/METADATA,sha256=p-mulDC4TJ2QOJr2peiHCygfiVP1bwwdubi-fyfLQkg,13655
170
- datachain-0.34.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- datachain-0.34.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
- datachain-0.34.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
- datachain-0.34.2.dist-info/RECORD,,
168
+ datachain-0.34.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.34.3.dist-info/METADATA,sha256=l1d5np6lvB4K8ohVibIbhzlNobGtlglmBhK0VcQqV-U,13655
170
+ datachain-0.34.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.34.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.34.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.34.3.dist-info/RECORD,,