datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/schema.py
@@ -1,12 +1,6 @@
1
1
  import inspect
2
2
  from collections.abc import Iterable, Iterator, Sequence
3
- from typing import (
4
- TYPE_CHECKING,
5
- Any,
6
- Generic,
7
- Optional,
8
- TypeVar,
9
- )
3
+ from typing import TYPE_CHECKING, Any, Generic, TypeVar
10
4
 
11
5
  import sqlalchemy as sa
12
6
  from sqlalchemy.sql import func as f
@@ -17,7 +11,6 @@ from datachain.sql.types import (
17
11
  JSON,
18
12
  Boolean,
19
13
  DateTime,
20
- Int,
21
14
  Int64,
22
15
  SQLType,
23
16
  String,
@@ -51,7 +44,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
51
44
  """
52
45
  c_set: dict[str, sa.Column] = {}
53
46
  for c in columns:
54
- if (ec := c_set.get(c.name, None)) is not None:
47
+ if (ec := c_set.get(c.name)) is not None:
55
48
  if str(ec.type) != str(c.type):
56
49
  raise ValueError(
57
50
  f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
@@ -96,11 +89,11 @@ class DirExpansion:
96
89
  def __init__(self, column: str):
97
90
  self.column = column
98
91
 
99
- def col_name(self, name: str, column: Optional[str] = None) -> str:
92
+ def col_name(self, name: str, column: str | None = None) -> str:
100
93
  column = column or self.column
101
94
  return col_name(name, column)
102
95
 
103
- def c(self, query, name: str, column: Optional[str] = None) -> str:
96
+ def c(self, query, name: str, column: str | None = None) -> str:
104
97
  return getattr(query.c, self.col_name(name, column=column))
105
98
 
106
99
  def base_select(self, q):
@@ -161,7 +154,7 @@ class DataTable:
161
154
  self,
162
155
  name: str,
163
156
  engine: "DatabaseEngine",
164
- column_types: Optional[dict[str, SQLType]] = None,
157
+ column_types: dict[str, SQLType] | None = None,
165
158
  column: str = "file",
166
159
  ):
167
160
  self.name: str = name
@@ -172,12 +165,12 @@ class DataTable:
172
165
  @staticmethod
173
166
  def copy_column(
174
167
  column: sa.Column,
175
- primary_key: Optional[bool] = None,
176
- index: Optional[bool] = None,
177
- nullable: Optional[bool] = None,
178
- default: Optional[Any] = None,
179
- server_default: Optional[Any] = None,
180
- unique: Optional[bool] = None,
168
+ primary_key: bool | None = None,
169
+ index: bool | None = None,
170
+ nullable: bool | None = None,
171
+ default: Any | None = None,
172
+ server_default: Any | None = None,
173
+ unique: bool | None = None,
181
174
  ) -> sa.Column:
182
175
  """
183
176
  Copy a sqlalchemy Column object intended for use as a signal column.
@@ -206,8 +199,8 @@ class DataTable:
206
199
  def new_table(
207
200
  cls,
208
201
  name: str,
209
- columns: Sequence["sa.Column"] = (),
210
- metadata: Optional["sa.MetaData"] = None,
202
+ columns: Sequence[sa.Column] = (),
203
+ metadata: sa.MetaData | None = None,
211
204
  ):
212
205
  # copy columns, since reusing the same objects from another table
213
206
  # may raise an error
@@ -218,7 +211,7 @@ class DataTable:
218
211
  metadata = sa.MetaData()
219
212
  return sa.Table(name, metadata, *columns)
220
213
 
221
- def get_table(self) -> "sa.Table":
214
+ def get_table(self) -> sa.Table:
222
215
  table = self.engine.get_table(self.name)
223
216
 
224
217
  column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
@@ -233,19 +226,19 @@ class DataTable:
233
226
  def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
234
227
  return self.table.columns
235
228
 
236
- def col_name(self, name: str, column: Optional[str] = None) -> str:
229
+ def col_name(self, name: str, column: str | None = None) -> str:
237
230
  column = column or self.column
238
231
  return col_name(name, column)
239
232
 
240
- def without_object(self, column_name: str, column: Optional[str] = None) -> str:
233
+ def without_object(self, column_name: str, column: str | None = None) -> str:
241
234
  column = column or self.column
242
235
  return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
243
236
 
244
- def c(self, name: str, column: Optional[str] = None):
237
+ def c(self, name: str, column: str | None = None):
245
238
  return getattr(self.columns, self.col_name(name, column=column))
246
239
 
247
240
  @property
248
- def table(self) -> "sa.Table":
241
+ def table(self) -> sa.Table:
249
242
  return self.get_table()
250
243
 
251
244
  def apply_conditions(self, query: "Executable") -> "Executable":
@@ -275,7 +268,7 @@ class DataTable:
275
268
  @classmethod
276
269
  def sys_columns(cls):
277
270
  return [
278
- sa.Column("sys__id", Int, primary_key=True),
271
+ sa.Column("sys__id", UInt64, primary_key=True),
279
272
  sa.Column(
280
273
  "sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
281
274
  ),
@@ -303,7 +296,7 @@ PARTITION_COLUMN_ID = "partition_id"
303
296
  partition_col_names = [PARTITION_COLUMN_ID]
304
297
 
305
298
 
306
- def partition_columns() -> Sequence["sa.Column"]:
299
+ def partition_columns() -> Sequence[sa.Column]:
307
300
  return [
308
301
  sa.Column(PARTITION_COLUMN_ID, sa.Integer),
309
302
  ]
datachain/data_storage/serializer.py
@@ -1,29 +1,119 @@
1
1
  import base64
2
- import pickle
3
2
  from abc import abstractmethod
4
3
  from collections.abc import Callable
5
- from typing import Any
4
+ from typing import Any, ClassVar
5
+
6
+ from datachain import json
7
+ from datachain.plugins import ensure_plugins_loaded
8
+
9
+
10
+ class CallableRegistry:
11
+ _registry: ClassVar[dict[str, Callable]] = {}
12
+
13
+ @classmethod
14
+ def register(cls, callable_obj: Callable, name: str) -> str:
15
+ cls._registry[name] = callable_obj
16
+ return name
17
+
18
+ @classmethod
19
+ def get(cls, name: str) -> Callable:
20
+ return cls._registry[name]
6
21
 
7
22
 
8
23
  class Serializable:
24
+ @classmethod
25
+ @abstractmethod
26
+ def serialize_callable_name(cls) -> str:
27
+ """Return the registered name used for this class' factory callable."""
28
+
9
29
  @abstractmethod
10
30
  def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
11
- """
12
- Returns the class, args, and kwargs needed to instantiate a cloned copy
13
- of this instance for use in separate processes or machines.
14
- """
31
+ """Return (callable, args, kwargs) necessary to recreate this object."""
32
+
33
+ def _prepare(self, params: tuple) -> dict:
34
+ callable, args, kwargs = params
35
+ callable_name = callable.__self__.serialize_callable_name()
36
+ return {
37
+ "callable": callable_name,
38
+ "args": args,
39
+ "kwargs": {
40
+ k: self._prepare(v) if isinstance(v, tuple) else v
41
+ for k, v in kwargs.items()
42
+ },
43
+ }
15
44
 
16
45
  def serialize(self) -> str:
17
- """
18
- Returns a string representation of clone params.
19
- This is useful for storing the state of an object in environment variable.
20
- """
21
- return base64.b64encode(pickle.dumps(self.clone_params())).decode()
46
+ """Return a base64-encoded JSON string with registered callable + params."""
47
+ _ensure_default_callables_registered()
48
+ data = self.clone_params()
49
+ return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
22
50
 
23
51
 
24
52
  def deserialize(s: str) -> Serializable:
53
+ """Deserialize from base64-encoded JSON using only registered callables.
54
+
55
+ Nested serialized objects are instantiated automatically except for those
56
+ passed via clone parameter tuples (keys ending with ``_clone_params``),
57
+ which must remain as (callable, args, kwargs) for later factory usage.
25
58
  """
26
- Returns a new instance of the class represented by the string.
27
- """
28
- (f, args, kwargs) = pickle.loads(base64.b64decode(s.encode())) # noqa: S301
29
- return f(*args, **kwargs)
59
+ ensure_plugins_loaded()
60
+ _ensure_default_callables_registered()
61
+ decoded = base64.b64decode(s.encode())
62
+ data = json.loads(decoded.decode())
63
+
64
+ def _is_serialized(obj: Any) -> bool:
65
+ return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
66
+ obj.keys()
67
+ )
68
+
69
+ def _reconstruct(obj: Any, nested: bool = False) -> Any:
70
+ if not _is_serialized(obj):
71
+ return obj
72
+ callable_name: str = obj["callable"]
73
+ args: list[Any] = obj["args"]
74
+ kwargs: dict[str, Any] = obj["kwargs"]
75
+ # Recurse only inside kwargs because serialize() only nests through kwargs
76
+ for k, v in list(kwargs.items()):
77
+ if _is_serialized(v):
78
+ kwargs[k] = _reconstruct(v, True)
79
+ callable_obj = CallableRegistry.get(callable_name)
80
+ if nested:
81
+ return (callable_obj, args, kwargs)
82
+ # Otherwise instantiate
83
+ return callable_obj(*args, **kwargs)
84
+
85
+ if not _is_serialized(data):
86
+ raise ValueError("Invalid serialized data format")
87
+ return _reconstruct(data, False)
88
+
89
+
90
+ class _DefaultsState:
91
+ registered = False
92
+
93
+
94
+ def _ensure_default_callables_registered() -> None:
95
+ if _DefaultsState.registered:
96
+ return
97
+
98
+ from datachain.data_storage.sqlite import (
99
+ SQLiteDatabaseEngine,
100
+ SQLiteMetastore,
101
+ SQLiteWarehouse,
102
+ )
103
+
104
+ # Register (idempotent by name overwrite is fine) using class-level
105
+ # serialization names to avoid hard-coded literals here.
106
+ CallableRegistry.register(
107
+ SQLiteDatabaseEngine.from_db_file,
108
+ SQLiteDatabaseEngine.serialize_callable_name(),
109
+ )
110
+ CallableRegistry.register(
111
+ SQLiteMetastore.init_after_clone,
112
+ SQLiteMetastore.serialize_callable_name(),
113
+ )
114
+ CallableRegistry.register(
115
+ SQLiteWarehouse.init_after_clone,
116
+ SQLiteWarehouse.serialize_callable_name(),
117
+ )
118
+
119
+ _DefaultsState.registered = True