datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,19 +1,21 @@
1
1
  import inspect
2
2
  from collections.abc import Iterable, Iterator, Sequence
3
- from typing import (
4
- TYPE_CHECKING,
5
- Any,
6
- Generic,
7
- Optional,
8
- TypeVar,
9
- )
3
+ from typing import TYPE_CHECKING, Any, Generic, TypeVar
10
4
 
11
5
  import sqlalchemy as sa
12
6
  from sqlalchemy.sql import func as f
13
7
  from sqlalchemy.sql.expression import false, null, true
14
8
 
15
9
  from datachain.sql.functions import path as pathfunc
16
- from datachain.sql.types import Int, SQLType, UInt64
10
+ from datachain.sql.types import (
11
+ JSON,
12
+ Boolean,
13
+ DateTime,
14
+ Int64,
15
+ SQLType,
16
+ String,
17
+ UInt64,
18
+ )
17
19
 
18
20
  if TYPE_CHECKING:
19
21
  from sqlalchemy.engine.interfaces import Dialect
@@ -30,8 +32,8 @@ if TYPE_CHECKING:
30
32
  DEFAULT_DELIMITER = "__"
31
33
 
32
34
 
33
- def col_name(name: str, object_name: str = "file") -> str:
34
- return f"{object_name}{DEFAULT_DELIMITER}{name}"
35
+ def col_name(name: str, column: str = "file") -> str:
36
+ return f"{column}{DEFAULT_DELIMITER}{name}"
35
37
 
36
38
 
37
39
  def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
@@ -42,7 +44,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
42
44
  """
43
45
  c_set: dict[str, sa.Column] = {}
44
46
  for c in columns:
45
- if (ec := c_set.get(c.name, None)) is not None:
47
+ if (ec := c_set.get(c.name)) is not None:
46
48
  if str(ec.type) != str(c.type):
47
49
  raise ValueError(
48
50
  f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
@@ -84,19 +86,19 @@ def convert_rows_custom_column_types(
84
86
 
85
87
 
86
88
  class DirExpansion:
87
- def __init__(self, object_name: str):
88
- self.object_name = object_name
89
+ def __init__(self, column: str):
90
+ self.column = column
89
91
 
90
- def col_name(self, name: str, object_name: Optional[str] = None) -> str:
91
- object_name = object_name or self.object_name
92
- return col_name(name, object_name)
92
+ def col_name(self, name: str, column: str | None = None) -> str:
93
+ column = column or self.column
94
+ return col_name(name, column)
93
95
 
94
- def c(self, query, name: str, object_name: Optional[str] = None) -> str:
95
- return getattr(query.c, self.col_name(name, object_name=object_name))
96
+ def c(self, query, name: str, column: str | None = None) -> str:
97
+ return getattr(query.c, self.col_name(name, column=column))
96
98
 
97
99
  def base_select(self, q):
98
100
  return sa.select(
99
- self.c(q, "id", object_name="sys"),
101
+ self.c(q, "id", column="sys"),
100
102
  false().label(self.col_name("is_dir")),
101
103
  self.c(q, "source"),
102
104
  self.c(q, "path"),
@@ -152,23 +154,23 @@ class DataTable:
152
154
  self,
153
155
  name: str,
154
156
  engine: "DatabaseEngine",
155
- column_types: Optional[dict[str, SQLType]] = None,
156
- object_name: str = "file",
157
+ column_types: dict[str, SQLType] | None = None,
158
+ column: str = "file",
157
159
  ):
158
160
  self.name: str = name
159
161
  self.engine = engine
160
162
  self.column_types: dict[str, SQLType] = column_types or {}
161
- self.object_name = object_name
163
+ self.column = column
162
164
 
163
165
  @staticmethod
164
166
  def copy_column(
165
167
  column: sa.Column,
166
- primary_key: Optional[bool] = None,
167
- index: Optional[bool] = None,
168
- nullable: Optional[bool] = None,
169
- default: Optional[Any] = None,
170
- server_default: Optional[Any] = None,
171
- unique: Optional[bool] = None,
168
+ primary_key: bool | None = None,
169
+ index: bool | None = None,
170
+ nullable: bool | None = None,
171
+ default: Any | None = None,
172
+ server_default: Any | None = None,
173
+ unique: bool | None = None,
172
174
  ) -> sa.Column:
173
175
  """
174
176
  Copy a sqlalchemy Column object intended for use as a signal column.
@@ -197,8 +199,8 @@ class DataTable:
197
199
  def new_table(
198
200
  cls,
199
201
  name: str,
200
- columns: Sequence["sa.Column"] = (),
201
- metadata: Optional["sa.MetaData"] = None,
202
+ columns: Sequence[sa.Column] = (),
203
+ metadata: sa.MetaData | None = None,
202
204
  ):
203
205
  # copy columns, since reusing the same objects from another table
204
206
  # may raise an error
@@ -209,7 +211,7 @@ class DataTable:
209
211
  metadata = sa.MetaData()
210
212
  return sa.Table(name, metadata, *columns)
211
213
 
212
- def get_table(self) -> "sa.Table":
214
+ def get_table(self) -> sa.Table:
213
215
  table = self.engine.get_table(self.name)
214
216
 
215
217
  column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
@@ -224,21 +226,19 @@ class DataTable:
224
226
  def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
225
227
  return self.table.columns
226
228
 
227
- def col_name(self, name: str, object_name: Optional[str] = None) -> str:
228
- object_name = object_name or self.object_name
229
- return col_name(name, object_name)
229
+ def col_name(self, name: str, column: str | None = None) -> str:
230
+ column = column or self.column
231
+ return col_name(name, column)
230
232
 
231
- def without_object(
232
- self, column_name: str, object_name: Optional[str] = None
233
- ) -> str:
234
- object_name = object_name or self.object_name
235
- return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
233
+ def without_object(self, column_name: str, column: str | None = None) -> str:
234
+ column = column or self.column
235
+ return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
236
236
 
237
- def c(self, name: str, object_name: Optional[str] = None):
238
- return getattr(self.columns, self.col_name(name, object_name=object_name))
237
+ def c(self, name: str, column: str | None = None):
238
+ return getattr(self.columns, self.col_name(name, column=column))
239
239
 
240
240
  @property
241
- def table(self) -> "sa.Table":
241
+ def table(self) -> sa.Table:
242
242
  return self.get_table()
243
243
 
244
244
  def apply_conditions(self, query: "Executable") -> "Executable":
@@ -268,14 +268,27 @@ class DataTable:
268
268
  @classmethod
269
269
  def sys_columns(cls):
270
270
  return [
271
- sa.Column("sys__id", Int, primary_key=True),
271
+ sa.Column("sys__id", UInt64, primary_key=True),
272
272
  sa.Column(
273
273
  "sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
274
274
  ),
275
275
  ]
276
276
 
277
+ @classmethod
278
+ def listing_columns(cls):
279
+ return [
280
+ sa.Column("file__source", String()),
281
+ sa.Column("file__path", String()),
282
+ sa.Column("file__size", Int64()),
283
+ sa.Column("file__version", String()),
284
+ sa.Column("file__etag", String()),
285
+ sa.Column("file__is_latest", Boolean()),
286
+ sa.Column("file__last_modified", DateTime()),
287
+ sa.Column("file__location", JSON()),
288
+ ]
289
+
277
290
  def dir_expansion(self):
278
- return DirExpansion(self.object_name)
291
+ return DirExpansion(self.column)
279
292
 
280
293
 
281
294
  PARTITION_COLUMN_ID = "partition_id"
@@ -283,7 +296,7 @@ PARTITION_COLUMN_ID = "partition_id"
283
296
  partition_col_names = [PARTITION_COLUMN_ID]
284
297
 
285
298
 
286
- def partition_columns() -> Sequence["sa.Column"]:
299
+ def partition_columns() -> Sequence[sa.Column]:
287
300
  return [
288
301
  sa.Column(PARTITION_COLUMN_ID, sa.Integer),
289
302
  ]
@@ -1,29 +1,119 @@
1
1
  import base64
2
- import pickle
3
2
  from abc import abstractmethod
4
3
  from collections.abc import Callable
5
- from typing import Any
4
+ from typing import Any, ClassVar
5
+
6
+ from datachain import json
7
+ from datachain.plugins import ensure_plugins_loaded
8
+
9
+
10
+ class CallableRegistry:
11
+ _registry: ClassVar[dict[str, Callable]] = {}
12
+
13
+ @classmethod
14
+ def register(cls, callable_obj: Callable, name: str) -> str:
15
+ cls._registry[name] = callable_obj
16
+ return name
17
+
18
+ @classmethod
19
+ def get(cls, name: str) -> Callable:
20
+ return cls._registry[name]
6
21
 
7
22
 
8
23
  class Serializable:
24
+ @classmethod
25
+ @abstractmethod
26
+ def serialize_callable_name(cls) -> str:
27
+ """Return the registered name used for this class' factory callable."""
28
+
9
29
  @abstractmethod
10
30
  def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
11
- """
12
- Returns the class, args, and kwargs needed to instantiate a cloned copy
13
- of this instance for use in separate processes or machines.
14
- """
31
+ """Return (callable, args, kwargs) necessary to recreate this object."""
32
+
33
+ def _prepare(self, params: tuple) -> dict:
34
+ callable, args, kwargs = params
35
+ callable_name = callable.__self__.serialize_callable_name()
36
+ return {
37
+ "callable": callable_name,
38
+ "args": args,
39
+ "kwargs": {
40
+ k: self._prepare(v) if isinstance(v, tuple) else v
41
+ for k, v in kwargs.items()
42
+ },
43
+ }
15
44
 
16
45
  def serialize(self) -> str:
17
- """
18
- Returns a string representation of clone params.
19
- This is useful for storing the state of an object in environment variable.
20
- """
21
- return base64.b64encode(pickle.dumps(self.clone_params())).decode()
46
+ """Return a base64-encoded JSON string with registered callable + params."""
47
+ _ensure_default_callables_registered()
48
+ data = self.clone_params()
49
+ return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
22
50
 
23
51
 
24
52
  def deserialize(s: str) -> Serializable:
53
+ """Deserialize from base64-encoded JSON using only registered callables.
54
+
55
+ Nested serialized objects are instantiated automatically except for those
56
+ passed via clone parameter tuples (keys ending with ``_clone_params``),
57
+ which must remain as (callable, args, kwargs) for later factory usage.
25
58
  """
26
- Returns a new instance of the class represented by the string.
27
- """
28
- (f, args, kwargs) = pickle.loads(base64.b64decode(s.encode())) # noqa: S301
29
- return f(*args, **kwargs)
59
+ ensure_plugins_loaded()
60
+ _ensure_default_callables_registered()
61
+ decoded = base64.b64decode(s.encode())
62
+ data = json.loads(decoded.decode())
63
+
64
+ def _is_serialized(obj: Any) -> bool:
65
+ return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
66
+ obj.keys()
67
+ )
68
+
69
+ def _reconstruct(obj: Any, nested: bool = False) -> Any:
70
+ if not _is_serialized(obj):
71
+ return obj
72
+ callable_name: str = obj["callable"]
73
+ args: list[Any] = obj["args"]
74
+ kwargs: dict[str, Any] = obj["kwargs"]
75
+ # Recurse only inside kwargs because serialize() only nests through kwargs
76
+ for k, v in list(kwargs.items()):
77
+ if _is_serialized(v):
78
+ kwargs[k] = _reconstruct(v, True)
79
+ callable_obj = CallableRegistry.get(callable_name)
80
+ if nested:
81
+ return (callable_obj, args, kwargs)
82
+ # Otherwise instantiate
83
+ return callable_obj(*args, **kwargs)
84
+
85
+ if not _is_serialized(data):
86
+ raise ValueError("Invalid serialized data format")
87
+ return _reconstruct(data, False)
88
+
89
+
90
+ class _DefaultsState:
91
+ registered = False
92
+
93
+
94
+ def _ensure_default_callables_registered() -> None:
95
+ if _DefaultsState.registered:
96
+ return
97
+
98
+ from datachain.data_storage.sqlite import (
99
+ SQLiteDatabaseEngine,
100
+ SQLiteMetastore,
101
+ SQLiteWarehouse,
102
+ )
103
+
104
+ # Register (idempotent by name overwrite is fine) using class-level
105
+ # serialization names to avoid hard-coded literals here.
106
+ CallableRegistry.register(
107
+ SQLiteDatabaseEngine.from_db_file,
108
+ SQLiteDatabaseEngine.serialize_callable_name(),
109
+ )
110
+ CallableRegistry.register(
111
+ SQLiteMetastore.init_after_clone,
112
+ SQLiteMetastore.serialize_callable_name(),
113
+ )
114
+ CallableRegistry.register(
115
+ SQLiteWarehouse.init_after_clone,
116
+ SQLiteWarehouse.serialize_callable_name(),
117
+ )
118
+
119
+ _DefaultsState.registered = True