datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/data_storage/schema.py
CHANGED
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
import inspect
|
|
2
2
|
from collections.abc import Iterable, Iterator, Sequence
|
|
3
|
-
from typing import
|
|
4
|
-
TYPE_CHECKING,
|
|
5
|
-
Any,
|
|
6
|
-
Generic,
|
|
7
|
-
Optional,
|
|
8
|
-
TypeVar,
|
|
9
|
-
)
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Generic, TypeVar
|
|
10
4
|
|
|
11
5
|
import sqlalchemy as sa
|
|
12
6
|
from sqlalchemy.sql import func as f
|
|
13
7
|
from sqlalchemy.sql.expression import false, null, true
|
|
14
8
|
|
|
15
9
|
from datachain.sql.functions import path as pathfunc
|
|
16
|
-
from datachain.sql.types import
|
|
10
|
+
from datachain.sql.types import (
|
|
11
|
+
JSON,
|
|
12
|
+
Boolean,
|
|
13
|
+
DateTime,
|
|
14
|
+
Int64,
|
|
15
|
+
SQLType,
|
|
16
|
+
String,
|
|
17
|
+
UInt64,
|
|
18
|
+
)
|
|
17
19
|
|
|
18
20
|
if TYPE_CHECKING:
|
|
19
21
|
from sqlalchemy.engine.interfaces import Dialect
|
|
@@ -30,8 +32,8 @@ if TYPE_CHECKING:
|
|
|
30
32
|
DEFAULT_DELIMITER = "__"
|
|
31
33
|
|
|
32
34
|
|
|
33
|
-
def col_name(name: str,
|
|
34
|
-
return f"{
|
|
35
|
+
def col_name(name: str, column: str = "file") -> str:
|
|
36
|
+
return f"{column}{DEFAULT_DELIMITER}{name}"
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
@@ -42,7 +44,7 @@ def dedup_columns(columns: Iterable[sa.Column]) -> list[sa.Column]:
|
|
|
42
44
|
"""
|
|
43
45
|
c_set: dict[str, sa.Column] = {}
|
|
44
46
|
for c in columns:
|
|
45
|
-
if (ec := c_set.get(c.name
|
|
47
|
+
if (ec := c_set.get(c.name)) is not None:
|
|
46
48
|
if str(ec.type) != str(c.type):
|
|
47
49
|
raise ValueError(
|
|
48
50
|
f"conflicting types for column {c.name}:{c.type!s} and {ec.type!s}"
|
|
@@ -84,19 +86,19 @@ def convert_rows_custom_column_types(
|
|
|
84
86
|
|
|
85
87
|
|
|
86
88
|
class DirExpansion:
|
|
87
|
-
def __init__(self,
|
|
88
|
-
self.
|
|
89
|
+
def __init__(self, column: str):
|
|
90
|
+
self.column = column
|
|
89
91
|
|
|
90
|
-
def col_name(self, name: str,
|
|
91
|
-
|
|
92
|
-
return col_name(name,
|
|
92
|
+
def col_name(self, name: str, column: str | None = None) -> str:
|
|
93
|
+
column = column or self.column
|
|
94
|
+
return col_name(name, column)
|
|
93
95
|
|
|
94
|
-
def c(self, query, name: str,
|
|
95
|
-
return getattr(query.c, self.col_name(name,
|
|
96
|
+
def c(self, query, name: str, column: str | None = None) -> str:
|
|
97
|
+
return getattr(query.c, self.col_name(name, column=column))
|
|
96
98
|
|
|
97
99
|
def base_select(self, q):
|
|
98
100
|
return sa.select(
|
|
99
|
-
self.c(q, "id",
|
|
101
|
+
self.c(q, "id", column="sys"),
|
|
100
102
|
false().label(self.col_name("is_dir")),
|
|
101
103
|
self.c(q, "source"),
|
|
102
104
|
self.c(q, "path"),
|
|
@@ -152,23 +154,23 @@ class DataTable:
|
|
|
152
154
|
self,
|
|
153
155
|
name: str,
|
|
154
156
|
engine: "DatabaseEngine",
|
|
155
|
-
column_types:
|
|
156
|
-
|
|
157
|
+
column_types: dict[str, SQLType] | None = None,
|
|
158
|
+
column: str = "file",
|
|
157
159
|
):
|
|
158
160
|
self.name: str = name
|
|
159
161
|
self.engine = engine
|
|
160
162
|
self.column_types: dict[str, SQLType] = column_types or {}
|
|
161
|
-
self.
|
|
163
|
+
self.column = column
|
|
162
164
|
|
|
163
165
|
@staticmethod
|
|
164
166
|
def copy_column(
|
|
165
167
|
column: sa.Column,
|
|
166
|
-
primary_key:
|
|
167
|
-
index:
|
|
168
|
-
nullable:
|
|
169
|
-
default:
|
|
170
|
-
server_default:
|
|
171
|
-
unique:
|
|
168
|
+
primary_key: bool | None = None,
|
|
169
|
+
index: bool | None = None,
|
|
170
|
+
nullable: bool | None = None,
|
|
171
|
+
default: Any | None = None,
|
|
172
|
+
server_default: Any | None = None,
|
|
173
|
+
unique: bool | None = None,
|
|
172
174
|
) -> sa.Column:
|
|
173
175
|
"""
|
|
174
176
|
Copy a sqlalchemy Column object intended for use as a signal column.
|
|
@@ -197,8 +199,8 @@ class DataTable:
|
|
|
197
199
|
def new_table(
|
|
198
200
|
cls,
|
|
199
201
|
name: str,
|
|
200
|
-
columns: Sequence[
|
|
201
|
-
metadata:
|
|
202
|
+
columns: Sequence[sa.Column] = (),
|
|
203
|
+
metadata: sa.MetaData | None = None,
|
|
202
204
|
):
|
|
203
205
|
# copy columns, since reusing the same objects from another table
|
|
204
206
|
# may raise an error
|
|
@@ -209,7 +211,7 @@ class DataTable:
|
|
|
209
211
|
metadata = sa.MetaData()
|
|
210
212
|
return sa.Table(name, metadata, *columns)
|
|
211
213
|
|
|
212
|
-
def get_table(self) ->
|
|
214
|
+
def get_table(self) -> sa.Table:
|
|
213
215
|
table = self.engine.get_table(self.name)
|
|
214
216
|
|
|
215
217
|
column_types = self.column_types | {c.name: c.type for c in self.sys_columns()}
|
|
@@ -224,21 +226,19 @@ class DataTable:
|
|
|
224
226
|
def columns(self) -> "ReadOnlyColumnCollection[str, sa.Column[Any]]":
|
|
225
227
|
return self.table.columns
|
|
226
228
|
|
|
227
|
-
def col_name(self, name: str,
|
|
228
|
-
|
|
229
|
-
return col_name(name,
|
|
229
|
+
def col_name(self, name: str, column: str | None = None) -> str:
|
|
230
|
+
column = column or self.column
|
|
231
|
+
return col_name(name, column)
|
|
230
232
|
|
|
231
|
-
def without_object(
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
object_name = object_name or self.object_name
|
|
235
|
-
return column_name.removeprefix(f"{object_name}{DEFAULT_DELIMITER}")
|
|
233
|
+
def without_object(self, column_name: str, column: str | None = None) -> str:
|
|
234
|
+
column = column or self.column
|
|
235
|
+
return column_name.removeprefix(f"{column}{DEFAULT_DELIMITER}")
|
|
236
236
|
|
|
237
|
-
def c(self, name: str,
|
|
238
|
-
return getattr(self.columns, self.col_name(name,
|
|
237
|
+
def c(self, name: str, column: str | None = None):
|
|
238
|
+
return getattr(self.columns, self.col_name(name, column=column))
|
|
239
239
|
|
|
240
240
|
@property
|
|
241
|
-
def table(self) ->
|
|
241
|
+
def table(self) -> sa.Table:
|
|
242
242
|
return self.get_table()
|
|
243
243
|
|
|
244
244
|
def apply_conditions(self, query: "Executable") -> "Executable":
|
|
@@ -268,14 +268,27 @@ class DataTable:
|
|
|
268
268
|
@classmethod
|
|
269
269
|
def sys_columns(cls):
|
|
270
270
|
return [
|
|
271
|
-
sa.Column("sys__id",
|
|
271
|
+
sa.Column("sys__id", UInt64, primary_key=True),
|
|
272
272
|
sa.Column(
|
|
273
273
|
"sys__rand", UInt64, nullable=False, server_default=f.abs(f.random())
|
|
274
274
|
),
|
|
275
275
|
]
|
|
276
276
|
|
|
277
|
+
@classmethod
|
|
278
|
+
def listing_columns(cls):
|
|
279
|
+
return [
|
|
280
|
+
sa.Column("file__source", String()),
|
|
281
|
+
sa.Column("file__path", String()),
|
|
282
|
+
sa.Column("file__size", Int64()),
|
|
283
|
+
sa.Column("file__version", String()),
|
|
284
|
+
sa.Column("file__etag", String()),
|
|
285
|
+
sa.Column("file__is_latest", Boolean()),
|
|
286
|
+
sa.Column("file__last_modified", DateTime()),
|
|
287
|
+
sa.Column("file__location", JSON()),
|
|
288
|
+
]
|
|
289
|
+
|
|
277
290
|
def dir_expansion(self):
|
|
278
|
-
return DirExpansion(self.
|
|
291
|
+
return DirExpansion(self.column)
|
|
279
292
|
|
|
280
293
|
|
|
281
294
|
PARTITION_COLUMN_ID = "partition_id"
|
|
@@ -283,7 +296,7 @@ PARTITION_COLUMN_ID = "partition_id"
|
|
|
283
296
|
partition_col_names = [PARTITION_COLUMN_ID]
|
|
284
297
|
|
|
285
298
|
|
|
286
|
-
def partition_columns() -> Sequence[
|
|
299
|
+
def partition_columns() -> Sequence[sa.Column]:
|
|
287
300
|
return [
|
|
288
301
|
sa.Column(PARTITION_COLUMN_ID, sa.Integer),
|
|
289
302
|
]
|
|
datachain/data_storage/serializer.py
CHANGED
|
@@ -1,29 +1,119 @@
|
|
|
1
1
|
import base64
|
|
2
|
-
import pickle
|
|
3
2
|
from abc import abstractmethod
|
|
4
3
|
from collections.abc import Callable
|
|
5
|
-
from typing import Any
|
|
4
|
+
from typing import Any, ClassVar
|
|
5
|
+
|
|
6
|
+
from datachain import json
|
|
7
|
+
from datachain.plugins import ensure_plugins_loaded
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CallableRegistry:
|
|
11
|
+
_registry: ClassVar[dict[str, Callable]] = {}
|
|
12
|
+
|
|
13
|
+
@classmethod
|
|
14
|
+
def register(cls, callable_obj: Callable, name: str) -> str:
|
|
15
|
+
cls._registry[name] = callable_obj
|
|
16
|
+
return name
|
|
17
|
+
|
|
18
|
+
@classmethod
|
|
19
|
+
def get(cls, name: str) -> Callable:
|
|
20
|
+
return cls._registry[name]
|
|
6
21
|
|
|
7
22
|
|
|
8
23
|
class Serializable:
|
|
24
|
+
@classmethod
|
|
25
|
+
@abstractmethod
|
|
26
|
+
def serialize_callable_name(cls) -> str:
|
|
27
|
+
"""Return the registered name used for this class' factory callable."""
|
|
28
|
+
|
|
9
29
|
@abstractmethod
|
|
10
30
|
def clone_params(self) -> tuple[Callable[..., Any], list[Any], dict[str, Any]]:
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
31
|
+
"""Return (callable, args, kwargs) necessary to recreate this object."""
|
|
32
|
+
|
|
33
|
+
def _prepare(self, params: tuple) -> dict:
|
|
34
|
+
callable, args, kwargs = params
|
|
35
|
+
callable_name = callable.__self__.serialize_callable_name()
|
|
36
|
+
return {
|
|
37
|
+
"callable": callable_name,
|
|
38
|
+
"args": args,
|
|
39
|
+
"kwargs": {
|
|
40
|
+
k: self._prepare(v) if isinstance(v, tuple) else v
|
|
41
|
+
for k, v in kwargs.items()
|
|
42
|
+
},
|
|
43
|
+
}
|
|
15
44
|
|
|
16
45
|
def serialize(self) -> str:
|
|
17
|
-
"""
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return base64.b64encode(pickle.dumps(self.clone_params())).decode()
|
|
46
|
+
"""Return a base64-encoded JSON string with registered callable + params."""
|
|
47
|
+
_ensure_default_callables_registered()
|
|
48
|
+
data = self.clone_params()
|
|
49
|
+
return base64.b64encode(json.dumps(self._prepare(data)).encode()).decode()
|
|
22
50
|
|
|
23
51
|
|
|
24
52
|
def deserialize(s: str) -> Serializable:
|
|
53
|
+
"""Deserialize from base64-encoded JSON using only registered callables.
|
|
54
|
+
|
|
55
|
+
Nested serialized objects are instantiated automatically except for those
|
|
56
|
+
passed via clone parameter tuples (keys ending with ``_clone_params``),
|
|
57
|
+
which must remain as (callable, args, kwargs) for later factory usage.
|
|
25
58
|
"""
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
59
|
+
ensure_plugins_loaded()
|
|
60
|
+
_ensure_default_callables_registered()
|
|
61
|
+
decoded = base64.b64decode(s.encode())
|
|
62
|
+
data = json.loads(decoded.decode())
|
|
63
|
+
|
|
64
|
+
def _is_serialized(obj: Any) -> bool:
|
|
65
|
+
return isinstance(obj, dict) and {"callable", "args", "kwargs"}.issubset(
|
|
66
|
+
obj.keys()
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
def _reconstruct(obj: Any, nested: bool = False) -> Any:
|
|
70
|
+
if not _is_serialized(obj):
|
|
71
|
+
return obj
|
|
72
|
+
callable_name: str = obj["callable"]
|
|
73
|
+
args: list[Any] = obj["args"]
|
|
74
|
+
kwargs: dict[str, Any] = obj["kwargs"]
|
|
75
|
+
# Recurse only inside kwargs because serialize() only nests through kwargs
|
|
76
|
+
for k, v in list(kwargs.items()):
|
|
77
|
+
if _is_serialized(v):
|
|
78
|
+
kwargs[k] = _reconstruct(v, True)
|
|
79
|
+
callable_obj = CallableRegistry.get(callable_name)
|
|
80
|
+
if nested:
|
|
81
|
+
return (callable_obj, args, kwargs)
|
|
82
|
+
# Otherwise instantiate
|
|
83
|
+
return callable_obj(*args, **kwargs)
|
|
84
|
+
|
|
85
|
+
if not _is_serialized(data):
|
|
86
|
+
raise ValueError("Invalid serialized data format")
|
|
87
|
+
return _reconstruct(data, False)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class _DefaultsState:
|
|
91
|
+
registered = False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _ensure_default_callables_registered() -> None:
|
|
95
|
+
if _DefaultsState.registered:
|
|
96
|
+
return
|
|
97
|
+
|
|
98
|
+
from datachain.data_storage.sqlite import (
|
|
99
|
+
SQLiteDatabaseEngine,
|
|
100
|
+
SQLiteMetastore,
|
|
101
|
+
SQLiteWarehouse,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# Register (idempotent by name overwrite is fine) using class-level
|
|
105
|
+
# serialization names to avoid hard-coded literals here.
|
|
106
|
+
CallableRegistry.register(
|
|
107
|
+
SQLiteDatabaseEngine.from_db_file,
|
|
108
|
+
SQLiteDatabaseEngine.serialize_callable_name(),
|
|
109
|
+
)
|
|
110
|
+
CallableRegistry.register(
|
|
111
|
+
SQLiteMetastore.init_after_clone,
|
|
112
|
+
SQLiteMetastore.serialize_callable_name(),
|
|
113
|
+
)
|
|
114
|
+
CallableRegistry.register(
|
|
115
|
+
SQLiteWarehouse.init_after_clone,
|
|
116
|
+
SQLiteWarehouse.serialize_callable_name(),
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
_DefaultsState.registered = True
|