datachain 0.34.1__py3-none-any.whl → 0.34.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -3,6 +3,7 @@ import sys
3
3
  from importlib import import_module
4
4
  from typing import TYPE_CHECKING, Any, Optional
5
5
 
6
+ from datachain.plugins import ensure_plugins_loaded
6
7
  from datachain.utils import get_envs_by_prefix
7
8
 
8
9
  if TYPE_CHECKING:
@@ -24,6 +25,8 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
24
25
 
25
26
 
26
27
  def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
28
+ ensure_plugins_loaded()
29
+
27
30
  from datachain.data_storage import AbstractMetastore
28
31
  from datachain.data_storage.serializer import deserialize
29
32
 
@@ -64,6 +67,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
64
67
 
65
68
 
66
69
  def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
70
+ ensure_plugins_loaded()
71
+
67
72
  from datachain.data_storage import AbstractWarehouse
68
73
  from datachain.data_storage.serializer import deserialize
69
74
 
@@ -1,29 +1,119 @@
1
1
  import base64
2
- import pickle
2
+ import json
3
3
  from abc import abstractmethod
4
4
  from collections.abc import Callable
5
- from typing import Any
5
+ from typing import Any, ClassVar
6
+
7
+ from datachain.plugins import ensure_plugins_loaded
8
+
9
+
10
+ class CallableRegistry:
11
+ _registry: ClassVar[dict[str, Callable]] = {}
12
+
13
+ @classmethod
14
+ def register(cls, callable_obj: Callable, name: str) -> str:
15
+ cls._registry[name] = callable_obj
16
+ return name
17
+
18
+ @classmethod
19
+ def get(cls, name: str) -> Callable:
20
+ return cls._registry[name]
6
21
 
7
22
 
8
23
  class Serializable:
24
+ @classmethod
25
+ @abstractmethod
26
+ def serialize_callable_name(cls) -> str:
27
+ """Return the registered name used for this class' factory callable."""
28
+
9
29
  @abstractmethod
10
30
  def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
11
- """
12
- Returns the class, args, and kwargs needed to instantiate a cloned copy
13
- of this instance for use in separate processes or machines.
14
- """
31
+ """Return (callable, args, kwargs) necessary to recreate this object."""
32
+
33
+ def _prepare(self, params: tuple) -> dict:
34
+ callable, args, kwargs = params
35
+ callable_name = callable.__self__.serialize_callable_name()
36
+ return {
37
+ "callable": callable_name,
38
+ "args": args,
39
+ "kwargs": {
40
+ k: self._prepare(v) if isinstance(v, tuple) else v
41
+ for k, v in kwargs.items()
42
+ },
43
+ }
15
44
 
16
45
  def serialize(self) -> str:
17
- """
18
- Returns a string representation of clone params.
19
- This is useful for storing the state of an object in environment variable.
20
- """
21
- return base64.b64encode(pickle.dumps(self.clone_params())).decode()
46
+ """Return a base64-encoded JSON string with registered callable + params."""
47
+ _ensure_default_callables_registered()
48
+ data = self.clone_params()
49
+ return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
22
50
 
23
51
 
24
52
  def deserialize(s: str) -> Serializable:
53
+ """Deserialize from base64-encoded JSON using only registered callables.
54
+
55
+ Nested serialized objects are instantiated automatically except for those
56
+ passed via clone parameter tuples (keys ending with ``_clone_params``),
57
+ which must remain as (callable, args, kwargs) for later factory usage.
25
58
  """
26
- Returns a new instance of the class represented by the string.
27
- """
28
- (f, args, kwargs) = pickle.loads(base64.b64decode(s.encode())) # noqa: S301
29
- return f(*args, **kwargs)
59
+ ensure_plugins_loaded()
60
+ _ensure_default_callables_registered()
61
+ decoded = base64.b64decode(s.encode())
62
+ data = json.loads(decoded.decode())
63
+
64
+ def _is_serialized(obj: Any) -> bool:
65
+ return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
66
+ obj.keys()
67
+ )
68
+
69
+ def _reconstruct(obj: Any, nested: bool = False) -> Any:
70
+ if not _is_serialized(obj):
71
+ return obj
72
+ callable_name: str = obj["callable"]
73
+ args: list[Any] = obj["args"]
74
+ kwargs: dict[str, Any] = obj["kwargs"]
75
+ # Recurse only inside kwargs because serialize() only nests through kwargs
76
+ for k, v in list(kwargs.items()):
77
+ if _is_serialized(v):
78
+ kwargs[k] = _reconstruct(v, True)
79
+ callable_obj = CallableRegistry.get(callable_name)
80
+ if nested:
81
+ return (callable_obj, args, kwargs)
82
+ # Otherwise instantiate
83
+ return callable_obj(*args, **kwargs)
84
+
85
+ if not _is_serialized(data):
86
+ raise ValueError("Invalid serialized data format")
87
+ return _reconstruct(data, False)
88
+
89
+
90
+ class _DefaultsState:
91
+ registered = False
92
+
93
+
94
+ def _ensure_default_callables_registered() -> None:
95
+ if _DefaultsState.registered:
96
+ return
97
+
98
+ from datachain.data_storage.sqlite import (
99
+ SQLiteDatabaseEngine,
100
+ SQLiteMetastore,
101
+ SQLiteWarehouse,
102
+ )
103
+
104
+ # Register (idempotent by name overwrite is fine) using class-level
105
+ # serialization names to avoid hard-coded literals here.
106
+ CallableRegistry.register(
107
+ SQLiteDatabaseEngine.from_db_file,
108
+ SQLiteDatabaseEngine.serialize_callable_name(),
109
+ )
110
+ CallableRegistry.register(
111
+ SQLiteMetastore.init_after_clone,
112
+ SQLiteMetastore.serialize_callable_name(),
113
+ )
114
+ CallableRegistry.register(
115
+ SQLiteWarehouse.init_after_clone,
116
+ SQLiteWarehouse.serialize_callable_name(),
117
+ )
118
+
119
+ _DefaultsState.registered = True
@@ -201,10 +201,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
201
201
  """
202
202
  return (
203
203
  SQLiteDatabaseEngine.from_db_file,
204
- [self.db_file],
204
+ [str(self.db_file)],
205
205
  {},
206
206
  )
207
207
 
208
+ @classmethod
209
+ def serialize_callable_name(cls) -> str:
210
+ return "sqlite.from_db_file"
211
+
208
212
  def _reconnect(self) -> None:
209
213
  if not self.is_closed:
210
214
  raise RuntimeError("Cannot reconnect on still-open DB!")
@@ -403,6 +407,10 @@ class SQLiteMetastore(AbstractDBMetastore):
403
407
  },
404
408
  )
405
409
 
410
+ @classmethod
411
+ def serialize_callable_name(cls) -> str:
412
+ return "sqlite.metastore.init_after_clone"
413
+
406
414
  @classmethod
407
415
  def init_after_clone(
408
416
  cls,
@@ -610,6 +618,10 @@ class SQLiteWarehouse(AbstractWarehouse):
610
618
  {"db_clone_params": self.db.clone_params()},
611
619
  )
612
620
 
621
+ @classmethod
622
+ def serialize_callable_name(cls) -> str:
623
+ return "sqlite.warehouse.init_after_clone"
624
+
613
625
  @classmethod
614
626
  def init_after_clone(
615
627
  cls,
@@ -863,8 +875,17 @@ class SQLiteWarehouse(AbstractWarehouse):
863
875
  if isinstance(c, BinaryExpression):
864
876
  right_left_join = add_left_rows_filter(c)
865
877
 
866
- union = sqlalchemy.union(left_right_join, right_left_join).subquery()
867
- return sqlalchemy.select(*union.c).select_from(union)
878
+ # Use CTE instead of subquery to force SQLite to materialize the result
879
+ # This breaks deep nesting and prevents parser stack overflow.
880
+ union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
881
+
882
+ return self._regenerate_system_columns(union_cte)
883
+
884
+ def _system_row_number_expr(self):
885
+ return func.row_number().over()
886
+
887
+ def _system_random_expr(self):
888
+ return self._system_row_number_expr() * 1103515245 + 12345
868
889
 
869
890
  def create_pre_udf_table(self, query: "Select") -> "Table":
870
891
  """
@@ -246,6 +246,44 @@ class AbstractWarehouse(ABC, Serializable):
246
246
  break # no more results
247
247
  offset += page_size
248
248
 
249
+ def _regenerate_system_columns(self, selectable):
250
+ """Return a SELECT that regenerates sys__id and sys__rand deterministically."""
251
+
252
+ base = selectable.subquery() if hasattr(selectable, "subquery") else selectable
253
+
254
+ system_types: dict[str, sa.types.TypeEngine] = {
255
+ sys_col.name: sys_col.type
256
+ for sys_col in self.schema.dataset_row_cls.sys_columns()
257
+ }
258
+
259
+ result_columns = []
260
+ for col in base.c:
261
+ if col.name == "sys__id":
262
+ expr = self._system_row_number_expr()
263
+ expr = sa.cast(expr, system_types["sys__id"])
264
+ result_columns.append(expr.label("sys__id"))
265
+ elif col.name == "sys__rand":
266
+ expr = self._system_random_expr()
267
+ expr = sa.cast(expr, system_types["sys__rand"])
268
+ result_columns.append(expr.label("sys__rand"))
269
+ else:
270
+ result_columns.append(col)
271
+
272
+ # Wrap in subquery to materialize window functions, then wrap again in SELECT
273
+ # This ensures window functions are computed before INSERT...FROM SELECT
274
+ inner = sa.select(*result_columns).select_from(base).subquery()
275
+ return sa.select(*inner.c).select_from(inner)
276
+
277
+ def _system_row_number_expr(self):
278
+ """Return an expression that produces deterministic row numbers."""
279
+
280
+ raise NotImplementedError
281
+
282
+ def _system_random_expr(self):
283
+ """Return an expression that produces deterministic random values."""
284
+
285
+ raise NotImplementedError
286
+
249
287
  #
250
288
  # Table Name Internal Functions
251
289
  #
@@ -923,6 +961,8 @@ class AbstractWarehouse(ABC, Serializable):
923
961
  right: "_FromClauseArgument",
924
962
  onclause: "_OnClauseArgument",
925
963
  inner: bool = True,
964
+ full: bool = False,
965
+ columns=None,
926
966
  ) -> sa.Select:
927
967
  """
928
968
  Join two tables together.
@@ -1701,7 +1701,11 @@ class DataChain:
1701
1701
  )
1702
1702
 
1703
1703
  query = self._query.join(
1704
- right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
1704
+ right_ds._query,
1705
+ sqlalchemy.and_(*ops),
1706
+ inner,
1707
+ full,
1708
+ rname + "{name}",
1705
1709
  )
1706
1710
  query.feature_schema = None
1707
1711
  ds = self._evolve(query=query)
@@ -1989,7 +1993,8 @@ class DataChain:
1989
1993
  results = self.results(include_hidden=include_hidden)
1990
1994
  if as_object:
1991
1995
  df = pd.DataFrame(results, columns=columns, dtype=object)
1992
- return df.where(pd.notna(df), None)
1996
+ df.where(pd.notna(df), None, inplace=True)
1997
+ return df
1993
1998
  return pd.DataFrame.from_records(results, columns=columns)
1994
1999
 
1995
2000
  def show(
datachain/plugins.py ADDED
@@ -0,0 +1,30 @@
1
+ """Plugin loader for DataChain callables.
2
+
3
+ Discovers and invokes entry points in the group "datachain.callables" once
4
+ per process. This enables external packages (e.g., Studio) to register
5
+ their callables with the serializer registry without explicit imports.
6
+ """
7
+
8
+ from importlib import metadata as importlib_metadata
9
+
10
+ _plugins_loaded = False
11
+
12
+
13
+ def ensure_plugins_loaded() -> None:
14
+ global _plugins_loaded # noqa: PLW0603
15
+ if _plugins_loaded:
16
+ return
17
+
18
+ # Compatible across importlib.metadata versions
19
+ eps_obj = importlib_metadata.entry_points()
20
+ if hasattr(eps_obj, "select"):
21
+ eps_list = eps_obj.select(group="datachain.callables")
22
+ else:
23
+ # Compatibility for older versions of importlib_metadata, Python 3.9
24
+ eps_list = eps_obj.get("datachain.callables", []) # type: ignore[attr-defined]
25
+
26
+ for ep in eps_list:
27
+ func = ep.load()
28
+ func()
29
+
30
+ _plugins_loaded = True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.1
3
+ Version: 0.34.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -14,6 +14,7 @@ datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
14
14
  datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
15
15
  datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
16
16
  datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2SmM,3989
17
+ datachain/plugins.py,sha256=eWOeKg1uBZYEJND8s4D8eZ9b2oEHGN9hahkQDMR45Jc,932
17
18
  datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
18
19
  datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
19
20
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,7 +26,7 @@ datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
25
26
  datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
26
27
  datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
27
28
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
28
- datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
29
+ datachain/catalog/loader.py,sha256=H25cESk72rzs_oAP22jIbthVHunslQbdr63CvV54Pko,6260
29
30
  datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
30
31
  datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
31
32
  datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
@@ -54,9 +55,9 @@ datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6
54
55
  datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
55
56
  datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
56
57
  datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
57
- datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
58
- datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
59
- datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
58
+ datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
59
+ datachain/data_storage/sqlite.py,sha256=aZY-1pjzvoRcPyIJ7i9QQu6kH8tCDkfFgPybHjFHg1k,31266
60
+ datachain/data_storage/warehouse.py,sha256=dPafzy-JsN2x9TD0j0ZBIUie6sE2Z8XzwELfOZ6quyU,34386
60
61
  datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
61
62
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
63
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -107,7 +108,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
107
108
  datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
108
109
  datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
109
110
  datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
110
- datachain/lib/dc/datachain.py,sha256=Xh7Hwpvow_3QHPhsPSpP99HDKlwcJOpZEZJUNa_Ex9c,104396
111
+ datachain/lib/dc/datachain.py,sha256=TW9kcqNJr46_gQTpeCcSxYKKUpkk9cLVW9ADTcPJrug,104474
111
112
  datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
112
113
  datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
113
114
  datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -164,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
164
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
165
166
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
166
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
167
- datachain-0.34.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
168
- datachain-0.34.1.dist-info/METADATA,sha256=x6vwqoDfsyj5T08GdAT7Qs13lv9uIonatPaxr_nPQ5Y,13655
169
- datachain-0.34.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
- datachain-0.34.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
171
- datachain-0.34.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
172
- datachain-0.34.1.dist-info/RECORD,,
168
+ datachain-0.34.3.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.34.3.dist-info/METADATA,sha256=l1d5np6lvB4K8ohVibIbhzlNobGtlglmBhK0VcQqV-U,13655
170
+ datachain-0.34.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.34.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.34.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.34.3.dist-info/RECORD,,