datachain 0.34.1__py3-none-any.whl → 0.34.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -3,6 +3,7 @@ import sys
3
3
  from importlib import import_module
4
4
  from typing import TYPE_CHECKING, Any, Optional
5
5
 
6
+ from datachain.plugins import ensure_plugins_loaded
6
7
  from datachain.utils import get_envs_by_prefix
7
8
 
8
9
  if TYPE_CHECKING:
@@ -24,6 +25,8 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
24
25
 
25
26
 
26
27
  def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
28
+ ensure_plugins_loaded()
29
+
27
30
  from datachain.data_storage import AbstractMetastore
28
31
  from datachain.data_storage.serializer import deserialize
29
32
 
@@ -64,6 +67,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
64
67
 
65
68
 
66
69
  def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
70
+ ensure_plugins_loaded()
71
+
67
72
  from datachain.data_storage import AbstractWarehouse
68
73
  from datachain.data_storage.serializer import deserialize
69
74
 
@@ -1,29 +1,119 @@
1
1
  import base64
2
- import pickle
2
+ import json
3
3
  from abc import abstractmethod
4
4
  from collections.abc import Callable
5
- from typing import Any
5
+ from typing import Any, ClassVar
6
+
7
+ from datachain.plugins import ensure_plugins_loaded
8
+
9
+
10
class CallableRegistry:
    """Name -> callable lookup table shared by the whole process.

    The serializer stores a registered *name* instead of the callable itself,
    and resolves that name back through this registry when deserializing.
    """

    # Shared, class-level mapping; registering an existing name overwrites it.
    _registry: ClassVar[dict[str, Callable]] = {}

    @classmethod
    def register(cls, callable_obj: Callable, name: str) -> str:
        """Store ``callable_obj`` under ``name`` and return ``name``."""
        cls._registry[name] = callable_obj
        return name

    @classmethod
    def get(cls, name: str) -> Callable:
        """Return the callable registered under ``name``.

        Raises:
            KeyError: if nothing has been registered under ``name``.
        """
        try:
            return cls._registry[name]
        except KeyError:
            # Same exception type as a plain dict lookup, but with a message
            # that says what is missing.
            raise KeyError(
                f"No callable registered under name {name!r}"
            ) from None
6
21
 
7
22
 
8
23
class Serializable:
    """Base for objects that can be recreated from a registered factory.

    Subclasses describe themselves as ``(callable, args, kwargs)`` through
    ``clone_params``; ``serialize`` turns that description into a
    base64-encoded JSON string in which the callable is replaced by its
    registered name.
    """

    @classmethod
    @abstractmethod
    def serialize_callable_name(cls) -> str:
        """Return the registered name used for this class' factory callable."""

    @abstractmethod
    def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
        """Return (callable, args, kwargs) necessary to recreate this object."""

    def _prepare(self, params: tuple) -> dict:
        """Convert a ``(callable, args, kwargs)`` triple into a JSON-ready dict.

        The callable must be a bound method (it is reached through
        ``__self__``) whose owner provides ``serialize_callable_name``.
        Keyword values that are themselves ``(callable, args, kwargs)``
        triples are converted recursively; every other value, including
        ordinary tuples, is passed through unchanged.
        """
        # Named ``factory`` rather than ``callable`` so the builtin stays
        # usable for the triple check below.
        factory, args, kwargs = params
        callable_name = factory.__self__.serialize_callable_name()

        def _is_clone_params(value: Any) -> bool:
            return (
                isinstance(value, tuple)
                and len(value) == 3
                and callable(value[0])
            )

        return {
            "callable": callable_name,
            "args": args,
            "kwargs": {
                k: self._prepare(v) if _is_clone_params(v) else v
                for k, v in kwargs.items()
            },
        }

    def serialize(self) -> str:
        """Return a base64-encoded JSON string with registered callable + params."""
        _ensure_default_callables_registered()
        data = self.clone_params()
        return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
22
50
 
23
51
 
24
52
def deserialize(s: str) -> Serializable:
    """Recreate an object from a base64-encoded JSON payload.

    The payload is a JSON object with the keys ``callable``, ``args`` and
    ``kwargs``. ``callable`` is a name that is resolved through
    ``CallableRegistry``; no other callables are invoked.

    Only the top-level payload is instantiated. A payload nested inside
    ``kwargs`` is converted back into a ``(callable, args, kwargs)`` tuple and
    handed to the outer callable as-is, so that the callable can build the
    nested object itself. Values inside ``args`` are not inspected.

    Raises:
        ValueError: if the decoded JSON is not an object containing the
            ``callable``, ``args`` and ``kwargs`` keys.
    """
    ensure_plugins_loaded()
    _ensure_default_callables_registered()
    decoded = base64.b64decode(s.encode())
    data = json.loads(decoded.decode())

    def _is_serialized(obj: Any) -> bool:
        return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
            obj.keys()
        )

    def _reconstruct(obj: Any, nested: bool = False) -> Any:
        if not _is_serialized(obj):
            return obj
        callable_name: str = obj["callable"]
        args: list[Any] = obj["args"]
        # Recurse only inside kwargs because serialize() only nests through
        # kwargs. Built as a new dict rather than rewritten in place.
        kwargs: dict[str, Any] = {
            k: _reconstruct(v, True) if _is_serialized(v) else v
            for k, v in obj["kwargs"].items()
        }
        callable_obj = CallableRegistry.get(callable_name)
        if nested:
            # Nested payloads stay as clone params for the outer factory.
            return (callable_obj, args, kwargs)
        # Otherwise instantiate
        return callable_obj(*args, **kwargs)

    if not _is_serialized(data):
        raise ValueError("Invalid serialized data format")
    return _reconstruct(data, False)
88
+
89
+
90
class _DefaultsState:
    """Holds the flag recording whether the default callables were registered."""

    registered = False


def _ensure_default_callables_registered() -> None:
    """Register the built-in SQLite factory callables on first use.

    Each factory is stored in ``CallableRegistry`` under the name returned by
    its class' ``serialize_callable_name``. Later calls return immediately.
    """
    if not _DefaultsState.registered:
        # Imported here rather than at module level.
        from datachain.data_storage.sqlite import (
            SQLiteDatabaseEngine,
            SQLiteMetastore,
            SQLiteWarehouse,
        )

        # (owning class, factory) pairs; names come from the classes so no
        # literals are repeated here. Re-registering a name just overwrites it.
        defaults = (
            (SQLiteDatabaseEngine, SQLiteDatabaseEngine.from_db_file),
            (SQLiteMetastore, SQLiteMetastore.init_after_clone),
            (SQLiteWarehouse, SQLiteWarehouse.init_after_clone),
        )
        for owner, factory in defaults:
            CallableRegistry.register(factory, owner.serialize_callable_name())

        _DefaultsState.registered = True
@@ -201,10 +201,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
201
201
  """
202
202
  return (
203
203
  SQLiteDatabaseEngine.from_db_file,
204
- [self.db_file],
204
+ [str(self.db_file)],
205
205
  {},
206
206
  )
207
207
 
208
@classmethod
def serialize_callable_name(cls) -> str:
    """Return the registry name for this class' ``from_db_file`` factory."""
    registry_name = "sqlite.from_db_file"
    return registry_name
211
+
208
212
  def _reconnect(self) -> None:
209
213
  if not self.is_closed:
210
214
  raise RuntimeError("Cannot reconnect on still-open DB!")
@@ -403,6 +407,10 @@ class SQLiteMetastore(AbstractDBMetastore):
403
407
  },
404
408
  )
405
409
 
410
@classmethod
def serialize_callable_name(cls) -> str:
    """Return the registry name for this class' ``init_after_clone`` factory."""
    registry_name = "sqlite.metastore.init_after_clone"
    return registry_name
413
+
406
414
  @classmethod
407
415
  def init_after_clone(
408
416
  cls,
@@ -610,6 +618,10 @@ class SQLiteWarehouse(AbstractWarehouse):
610
618
  {"db_clone_params": self.db.clone_params()},
611
619
  )
612
620
 
621
@classmethod
def serialize_callable_name(cls) -> str:
    """Return the registry name for this class' ``init_after_clone`` factory."""
    registry_name = "sqlite.warehouse.init_after_clone"
    return registry_name
624
+
613
625
  @classmethod
614
626
  def init_after_clone(
615
627
  cls,
datachain/plugins.py ADDED
@@ -0,0 +1,30 @@
1
"""Plugin loader for DataChain callables.

Discovers and invokes entry points in the group "datachain.callables" once
per process. This enables external packages (e.g., Studio) to register
their callables with the serializer registry without explicit imports.
"""

from importlib import metadata as importlib_metadata

# Set once the entry points have been invoked (or while they are running).
_plugins_loaded = False


def ensure_plugins_loaded() -> None:
    """Load and invoke every ``datachain.callables`` entry point, once.

    Each entry point is loaded and the resulting object is called with no
    arguments. After a successful pass, later calls return immediately.

    The loaded flag is raised *before* the hooks run so that a hook which
    (directly or indirectly) calls ``ensure_plugins_loaded`` again returns
    straight away instead of recursing. If discovery or any hook raises, the
    flag is cleared and the exception propagates, so the next call retries.
    """
    global _plugins_loaded  # noqa: PLW0603
    if _plugins_loaded:
        return

    _plugins_loaded = True
    try:
        # Compatible across importlib.metadata versions
        eps_obj = importlib_metadata.entry_points()
        if hasattr(eps_obj, "select"):
            eps_list = eps_obj.select(group="datachain.callables")
        else:
            # Compatibility for older versions of importlib_metadata, Python 3.9
            eps_list = eps_obj.get("datachain.callables", [])  # type: ignore[attr-defined]

        for ep in eps_list:
            func = ep.load()
            func()
    except BaseException:
        # Keep the original retry-on-failure behaviour.
        _plugins_loaded = False
        raise
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.34.1
3
+ Version: 0.34.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -14,6 +14,7 @@ datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
14
14
  datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
15
15
  datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
16
16
  datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2SmM,3989
17
+ datachain/plugins.py,sha256=eWOeKg1uBZYEJND8s4D8eZ9b2oEHGN9hahkQDMR45Jc,932
17
18
  datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
18
19
  datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
19
20
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -25,7 +26,7 @@ datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
25
26
  datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
26
27
  datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
27
28
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
28
- datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
29
+ datachain/catalog/loader.py,sha256=H25cESk72rzs_oAP22jIbthVHunslQbdr63CvV54Pko,6260
29
30
  datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
30
31
  datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
31
32
  datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
@@ -54,8 +55,8 @@ datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6
54
55
  datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
55
56
  datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
56
57
  datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
57
- datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
58
- datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
58
+ datachain/data_storage/serializer.py,sha256=oL8i8smyAeVUyDepk8Xhf3lFOGOEHMoZjA5GdFzvfGI,3862
59
+ datachain/data_storage/sqlite.py,sha256=YNHXPdJeTEoWfhZYb5fsLf1CIjiEhB7VG4OgQzDrWVU,30936
59
60
  datachain/data_storage/warehouse.py,sha256=7jc69CtWdfQlc_9WbJ5l6yQooarpLFBrDk4fY-svi_0,32783
60
61
  datachain/diff/__init__.py,sha256=v03JfMxH1VvwFl3rniedS4YWs6EXSfaLCULJTKNECE4,9603
61
62
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -164,9 +165,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
164
165
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
165
166
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
166
167
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
167
- datachain-0.34.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
168
- datachain-0.34.1.dist-info/METADATA,sha256=x6vwqoDfsyj5T08GdAT7Qs13lv9uIonatPaxr_nPQ5Y,13655
169
- datachain-0.34.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
170
- datachain-0.34.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
171
- datachain-0.34.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
172
- datachain-0.34.1.dist-info/RECORD,,
168
+ datachain-0.34.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
169
+ datachain-0.34.2.dist-info/METADATA,sha256=p-mulDC4TJ2QOJr2peiHCygfiVP1bwwdubi-fyfLQkg,13655
170
+ datachain-0.34.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ datachain-0.34.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
172
+ datachain-0.34.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
173
+ datachain-0.34.2.dist-info/RECORD,,