datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/schema.py
CHANGED
|
@@ -1,12 +1,6 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from collections.abc import Iterable, Iterator, Sequence
|
|
3
|
-
from typing import
|
|
4
|
-
TYPE_CHECKING,
|
|
5
|
-
Any,
|
|
6
|
-
Generic,
|
|
7
|
-
Optional,
|
|
8
|
-
TypeVar,
|
|
9
|
-
)
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Generic, TypeVar
|
|
10
4
|
|
|
11
5
|
import sqlalchemy as sa
|
|
12
6
|
from sqlalchemy.sql import func as f
|
|
@@ -17,7 +11,6 @@ from datachain.sql.types import (
|
|
|
17
11
|
JSON,
|
|
18
12
|
Boolean,
|
|
19
13
|
DateTime,
|
|
20
|
-
Int,
|
|
21
14
|
Int64,
|
|
22
15
|
SQLType,
|
|
23
16
|
String,
|
|
@@ -51,7 +44,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
|
51
44
|
"""
|
|
52
45
|
c_set: dict[str, sa.Column] = {}
|
|
53
46
|
for c in columns:
|
|
54
|
-
if (ec := c_set.get(c.name
|
|
47
|
+
if (ec := c_set.get(c.name)) is not None:
|
|
55
48
|
if str(ec.type) != str(c.type):
|
|
56
49
|
raise ValueError(
|
|
57
50
|
f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
|
|
@@ -96,11 +89,11 @@ class DirExpansion:
|
|
|
96
89
|
def __init__(self, column: str):
|
|
97
90
|
self.column = column
|
|
98
91
|
|
|
99
|
-
def col_name(self, name: str, column:
|
|
92
|
+
def col_name(self, name: str, column: str | None = None) -> str:
|
|
100
93
|
column = column or self.column
|
|
101
94
|
return col_name(name, column)
|
|
102
95
|
|
|
103
|
-
def c(self, query, name: str, column:
|
|
96
|
+
def c(self, query, name: str, column: str | None = None) -> str:
|
|
104
97
|
return getattr(query.c, self.col_name(name, column=column))
|
|
105
98
|
|
|
106
99
|
def base_select(self, q):
|
|
@@ -161,7 +154,7 @@ class DataTable:
|
|
|
161
154
|
self,
|
|
162
155
|
name: str,
|
|
163
156
|
engine: "DatabaseEngine",
|
|
164
|
-
column_types:
|
|
157
|
+
column_types: dict[str, SQLType] | None = None,
|
|
165
158
|
column: str = "file",
|
|
166
159
|
):
|
|
167
160
|
self.name: str = name
|
|
@@ -172,12 +165,12 @@ class DataTable:
|
|
|
172
165
|
@staticmethod
|
|
173
166
|
def copy_column(
|
|
174
167
|
column: sa.Column,
|
|
175
|
-
primary_key:
|
|
176
|
-
index:
|
|
177
|
-
nullable:
|
|
178
|
-
default:
|
|
179
|
-
server_default:
|
|
180
|
-
unique:
|
|
168
|
+
primary_key: bool | None = None,
|
|
169
|
+
index: bool | None = None,
|
|
170
|
+
nullable: bool | None = None,
|
|
171
|
+
default: Any | None = None,
|
|
172
|
+
server_default: Any | None = None,
|
|
173
|
+
unique: bool | None = None,
|
|
181
174
|
) -> sa.Column:
|
|
182
175
|
"""
|
|
183
176
|
Copy a sqlalchemy Column object intended for use as a signal column.
|
|
@@ -206,8 +199,8 @@ class DataTable:
|
|
|
206
199
|
def new_table(
|
|
207
200
|
cls,
|
|
208
201
|
name: str,
|
|
209
|
-
columns: Sequence[
|
|
210
|
-
metadata:
|
|
202
|
+
columns: Sequence[sa.Column] = (),
|
|
203
|
+
metadata: sa.MetaData | None = None,
|
|
211
204
|
):
|
|
212
205
|
# copy columns, since reusing the same objects from another table
|
|
213
206
|
# may raise an error
|
|
@@ -218,7 +211,7 @@ class DataTable:
|
|
|
218
211
|
metadata = sa.MetaData()
|
|
219
212
|
return sa.Table(name, metadata, *columns)
|
|
220
213
|
|
|
221
|
-
def get_table(self) ->
|
|
214
|
+
def get_table(self) -> sa.Table:
|
|
222
215
|
table = self.engine.get_table(self.name)
|
|
223
216
|
|
|
224
217
|
column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
|
|
@@ -233,19 +226,19 @@ class DataTable:
|
|
|
233
226
|
def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
|
|
234
227
|
return self.table.columns
|
|
235
228
|
|
|
236
|
-
def col_name(self, name: str, column:
|
|
229
|
+
def col_name(self, name: str, column: str | None = None) -> str:
|
|
237
230
|
column = column or self.column
|
|
238
231
|
return col_name(name, column)
|
|
239
232
|
|
|
240
|
-
def without_object(self, column_name: str, column:
|
|
233
|
+
def without_object(self, column_name: str, column: str | None = None) -> str:
|
|
241
234
|
column = column or self.column
|
|
242
235
|
return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
|
|
243
236
|
|
|
244
|
-
def c(self, name: str, column:
|
|
237
|
+
def c(self, name: str, column: str | None = None):
|
|
245
238
|
return getattr(self.columns, self.col_name(name, column=column))
|
|
246
239
|
|
|
247
240
|
@property
|
|
248
|
-
def table(self) ->
|
|
241
|
+
def table(self) -> sa.Table:
|
|
249
242
|
return self.get_table()
|
|
250
243
|
|
|
251
244
|
def apply_conditions(self, query: "Executable") -> "Executable":
|
|
@@ -275,7 +268,7 @@ class DataTable:
|
|
|
275
268
|
@classmethod
|
|
276
269
|
def sys_columns(cls):
|
|
277
270
|
return [
|
|
278
|
-
sa.Column("sys__id",
|
|
271
|
+
sa.Column("sys__id", UInt64, primary_key=True),
|
|
279
272
|
sa.Column(
|
|
280
273
|
"sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
|
|
281
274
|
),
|
|
@@ -303,7 +296,7 @@ PARTITION_COLUMN_ID = "partition_id"
|
|
|
303
296
|
partition_col_names = [PARTITION_COLUMN_ID]
|
|
304
297
|
|
|
305
298
|
|
|
306
|
-
def partition_columns() -> Sequence[
|
|
299
|
+
def partition_columns() -> Sequence[sa.Column]:
|
|
307
300
|
return [
|
|
308
301
|
sa.Column(PARTITION_COLUMN_ID, sa.Integer),
|
|
309
302
|
]
|
|
datachain/data_storage/serializer.py
CHANGED
|
@@ -1,29 +1,119 @@
|
|
|
1
1
|
import base64
|
|
2
|
-
import pickle
|
|
3
2
|
from abc import abstractmethod
|
|
4
3
|
from collections.abc import Callable
|
|
5
|
-
from typing import Any
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
|
+
|
|
6
|
+
from datachain import json
|
|
7
|
+
from datachain.plugins import ensure_plugins_loaded
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CallableRegistry:
|
|
11
|
+
_registry: ClassVar[dict[str, Callable]] = {}
|
|
12
|
+
|
|
13
|
+
@classmethod
|
|
14
|
+
def register(cls, callable_obj: Callable, name: str) -> str:
|
|
15
|
+
cls._registry[name] = callable_obj
|
|
16
|
+
return name
|
|
17
|
+
|
|
18
|
+
@classmethod
|
|
19
|
+
def get(cls, name: str) -> Callable:
|
|
20
|
+
return cls._registry[name]
|
|
6
21
|
|
|
7
22
|
|
|
8
23
|
class Serializable:
|
|
24
|
+
@classmethod
|
|
25
|
+
@abstractmethod
|
|
26
|
+
def serialize_callable_name(cls) -> str:
|
|
27
|
+
"""Return the registered name used for this class' factory callable."""
|
|
28
|
+
|
|
9
29
|
@abstractmethod
|
|
10
30
|
def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
31
|
+
"""Return (callable, args, kwargs) necessary to recreate this object."""
|
|
32
|
+
|
|
33
|
+
def _prepare(self, params: tuple) -> dict:
|
|
34
|
+
callable, args, kwargs = params
|
|
35
|
+
callable_name = callable.__self__.serialize_callable_name()
|
|
36
|
+
return {
|
|
37
|
+
"callable": callable_name,
|
|
38
|
+
"args": args,
|
|
39
|
+
"kwargs": {
|
|
40
|
+
k: self._prepare(v) if isinstance(v, tuple) else v
|
|
41
|
+
for k, v in kwargs.items()
|
|
42
|
+
},
|
|
43
|
+
}
|
|
15
44
|
|
|
16
45
|
def serialize(self) -> str:
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return base64.b64encode(pickle.dumps(self.clone_params())).decode()
|
|
46
|
+
"""Return a base64-encoded JSON string with registered callable + params."""
|
|
47
|
+
_ensure_default_callables_registered()
|
|
48
|
+
data = self.clone_params()
|
|
49
|
+
return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
|
|
22
50
|
|
|
23
51
|
|
|
24
52
|
def deserialize(s: str) -> Serializable:
|
|
53
|
+
"""Deserialize from base64-encoded JSON using only registered callables.
|
|
54
|
+
|
|
55
|
+
Nested serialized objects are instantiated automatically except for those
|
|
56
|
+
passed via clone parameter tuples (keys ending with ``_clone_params``),
|
|
57
|
+
which must remain as (callable, args, kwargs) for later factory usage.
|
|
25
58
|
"""
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
59
|
+
ensure_plugins_loaded()
|
|
60
|
+
_ensure_default_callables_registered()
|
|
61
|
+
decoded = base64.b64decode(s.encode())
|
|
62
|
+
data = json.loads(decoded.decode())
|
|
63
|
+
|
|
64
|
+
def _is_serialized(obj: Any) -> bool:
|
|
65
|
+
return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
|
|
66
|
+
obj.keys()
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
def _reconstruct(obj: Any, nested: bool = False) -> Any:
|
|
70
|
+
if not _is_serialized(obj):
|
|
71
|
+
return obj
|
|
72
|
+
callable_name: str = obj["callable"]
|
|
73
|
+
args: list[Any] = obj["args"]
|
|
74
|
+
kwargs: dict[str, Any] = obj["kwargs"]
|
|
75
|
+
# Recurse only inside kwargs because serialize() only nests through kwargs
|
|
76
|
+
for k, v in list(kwargs.items()):
|
|
77
|
+
if _is_serialized(v):
|
|
78
|
+
kwargs[k] = _reconstruct(v, True)
|
|
79
|
+
callable_obj = CallableRegistry.get(callable_name)
|
|
80
|
+
if nested:
|
|
81
|
+
return (callable_obj, args, kwargs)
|
|
82
|
+
# Otherwise instantiate
|
|
83
|
+
return callable_obj(*args, **kwargs)
|
|
84
|
+
|
|
85
|
+
if not _is_serialized(data):
|
|
86
|
+
raise ValueError("Invalid serialized data format")
|
|
87
|
+
return _reconstruct(data, False)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class _DefaultsState:
|
|
91
|
+
registered = False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _ensure_default_callables_registered() -> None:
|
|
95
|
+
if _DefaultsState.registered:
|
|
96
|
+
return
|
|
97
|
+
|
|
98
|
+
from datachain.data_storage.sqlite import (
|
|
99
|
+
SQLiteDatabaseEngine,
|
|
100
|
+
SQLiteMetastore,
|
|
101
|
+
SQLiteWarehouse,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# Register (idempotent by name overwrite is fine) using class-level
|
|
105
|
+
# serialization names to avoid hard-coded literals here.
|
|
106
|
+
CallableRegistry.register(
|
|
107
|
+
SQLiteDatabaseEngine.from_db_file,
|
|
108
|
+
SQLiteDatabaseEngine.serialize_callable_name(),
|
|
109
|
+
)
|
|
110
|
+
CallableRegistry.register(
|
|
111
|
+
SQLiteMetastore.init_after_clone,
|
|
112
|
+
SQLiteMetastore.serialize_callable_name(),
|
|
113
|
+
)
|
|
114
|
+
CallableRegistry.register(
|
|
115
|
+
SQLiteWarehouse.init_after_clone,
|
|
116
|
+
SQLiteWarehouse.serialize_callable_name(),
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
_DefaultsState.registered = True
|