datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/dataset.py
CHANGED
@@ -1,18 +1,17 @@
 import builtins
-import json
 from dataclasses import dataclass, fields
 from datetime import datetime
 from functools import cached_property
-from typing import (
-    Any,
-    NewType,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import Any, NewType, TypeVar
 from urllib.parse import urlparse
 
-from
+from packaging.specifiers import SpecifierSet
+from packaging.version import Version
+
+from datachain import json, semver
+from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
+from datachain.namespace import Namespace
+from datachain.project import Project
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
 T = TypeVar("T", bound="DatasetRecord")
@@ -25,6 +24,10 @@ DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"
 
+DEFAULT_DATASET_VERSION = "1.0.0"
+DATASET_NAME_RESERVED_CHARS = [".", "@"]
+DATASET_NAME_REPLACEMENT_CHAR = "_"
+
 
 # StorageURI represents a normalised URI to a valid storage location (full bucket or
 # absolute local path).
@@ -33,12 +36,12 @@ LISTING_PREFIX = "lst__"
 StorageURI = NewType("StorageURI", str)
 
 
-def parse_dataset_uri(uri: str) -> tuple[str,
+def parse_dataset_uri(uri: str) -> tuple[str, str | None]:
     """
     Parse dataser uri to extract name and version out of it (if version is defined)
     Example:
-        Input: ds://zalando@v3
-        Output: (zalando, 3)
+        Input: ds://zalando@v3.0.1
+        Output: (zalando, 3.0.1)
     """
     p = urlparse(uri)
     if p.scheme != "ds":
@@ -51,24 +54,74 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
         raise Exception(
             "Wrong dataset uri format, it should be: ds://<name>@v<version>"
         )
-
-    return name, version
+    return name, s[1]
 
 
-def create_dataset_uri(
+def create_dataset_uri(
+    name: str, namespace: str, project: str, version: str | None = None
+) -> str:
     """
-    Creates a dataset uri based on dataset name and optionally
+    Creates a dataset uri based on namespace, project, dataset name and optionally
+    version.
     Example:
-        Input: zalando, 3
-        Output: ds//zalando@v3
+        Input: dev, clothes, zalando, 3.0.1
+        Output: ds//dev.clothes.zalando@v3.0.1
     """
-    uri = f"{DATASET_PREFIX}{name}"
+    uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
     if version:
         uri += f"@v{version}"
 
     return uri
 
 
+def parse_dataset_name(name: str) -> tuple[str | None, str | None, str]:
+    """Parses dataset name and returns namespace, project and name"""
+    if not name:
+        raise InvalidDatasetNameError("Name must be defined to parse it")
+    split = name.split(".")
+    if len(split) > 3:
+        raise InvalidDatasetNameError(f"Invalid dataset name {name}")
+    name = split[-1]
+    project_name = split[-2] if len(split) > 1 else None
+    namespace_name = split[-3] if len(split) > 2 else None
+
+    return namespace_name, project_name, name
+
+
+def parse_schema(ct: dict[str, Any]) -> dict[str, SQLType | type[SQLType]]:
+    """Parse dataset schema from dictionary representation.
+
+    Args:
+        ct: Dictionary with column definitions
+
+    Returns:
+        Dictionary mapping column names to SQL types
+
+    Raises:
+        TypeError: If schema format is invalid
+        ValueError: If column type is not defined or not supported
+    """
+    if not isinstance(ct, dict):
+        raise TypeError("Schema definition must be a dictionary")
+    res = {}
+    for c_name, c_type in ct.items():
+        if not isinstance(c_type, dict):
+            raise TypeError(f"Schema column '{c_name}' type must be a dictionary")
+        if "type" not in c_type:
+            raise ValueError(f"Schema column '{c_name}' type is not defined")
+        if c_type["type"] not in NAME_TYPES_MAPPING:
+            raise ValueError(
+                f"Schema column '{c_name}' type '{c_type['type']}' is not supported"
+            )
+        try:
+            res[c_name] = NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type)  # type: ignore [attr-defined]
+        except Exception as e:
+            raise ValueError(
+                f"Schema column '{c_name}' type '{c_type['type']}' parsing error: {e}"
+            ) from e
+    return res
+
+
 class DatasetDependencyType:
     DATASET = "dataset"
     STORAGE = "storage"
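Taken together, the helpers above define the new three-part dataset addressing (namespace.project.name) and the switch from integer to semver version strings. A minimal usage sketch, using only the functions shown in this hunk; expected values follow their docstrings:

from datachain.dataset import (
    create_dataset_uri,
    parse_dataset_name,
    parse_dataset_uri,
)

# Signature order is (name, namespace, project, version)
uri = create_dataset_uri("zalando", "dev", "clothes", version="3.0.1")
print(uri)  # ds://dev.clothes.zalando@v3.0.1

# Dotted names split right-to-left; missing leading parts come back as None
print(parse_dataset_name("dev.clothes.zalando"))  # ('dev', 'clothes', 'zalando')
print(parse_dataset_name("zalando"))              # (None, None, 'zalando')

# parse_dataset_uri returns (name, version), with version None when absent
print(parse_dataset_uri("ds://zalando@v3.0.1"))   # ('zalando', '3.0.1')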
@@ -76,12 +129,16 @@ class DatasetDependencyType:
 
 @dataclass
 class DatasetDependency:
+    # TODO put `DatasetRecord` instead of name + version which will
+    # simplify codebase in various places
     id: int
     type: str
+    namespace: str
+    project: str
     name: str
-    version: str
+    version: str
     created_at: datetime
-    dependencies: list[
+    dependencies: list["DatasetDependency | None"]
 
     @property
     def dataset_name(self) -> str:
@@ -91,40 +148,41 @@ class DatasetDependency:
         if self.type == DatasetDependencyType.DATASET:
             return self.name
 
-        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/")
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"))
         assert list_dataset_name
         return list_dataset_name
 
     @classmethod
     def parse(
         cls: builtins.type[DD],
+        namespace_name: str,
+        project_name: str,
         id: int,
-        dataset_id:
-        dataset_version_id:
-        dataset_name:
-        dataset_version:
-        dataset_version_created_at:
-    ) ->
-        from datachain.
-        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+        dataset_id: int | None,
+        dataset_version_id: int | None,
+        dataset_name: str | None,
+        dataset_version: str | None,
+        dataset_version_created_at: datetime | None,
+    ) -> "DatasetDependency | None":
+        from datachain.lib.listing import is_listing_dataset
 
         if not dataset_id:
             return None
 
         assert dataset_name is not None
-        dependency_type = DatasetDependencyType.DATASET
-        dependency_name = dataset_name
-
-        if is_listing_dataset(dataset_name):
-            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-            dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))
 
         return cls(
             id,
-            dependency_type,
-            dependency_name,
             (
-
+                DatasetDependencyType.STORAGE
+                if is_listing_dataset(dataset_name)
+                else DatasetDependencyType.DATASET
+            ),
+            namespace_name,
+            project_name,
+            dataset_name,
+            (
+                dataset_version  # type: ignore[arg-type]
                 if dataset_version
                 else None
             ),
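Since DatasetDependency now carries namespace and project alongside name and version, each dependency row is fully addressable on its own. An illustrative construction (all values made up) following the dataclass field order above:

from datetime import datetime

from datachain.dataset import DatasetDependency, DatasetDependencyType

dep = DatasetDependency(
    id=1,
    type=DatasetDependencyType.DATASET,
    namespace="dev",
    project="clothes",
    name="zalando",
    version="1.0.0",
    created_at=datetime.now(),
    dependencies=[],
)
# For plain dataset dependencies, dataset_name is just the name
assert dep.dataset_name == "zalando"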
@@ -163,21 +221,21 @@ class DatasetVersion:
     id: int
     uuid: str
     dataset_id: int
-    version:
+    version: str
     status: int
     feature_schema: dict
     created_at: datetime
-    finished_at:
+    finished_at: datetime | None
     error_message: str
     error_stack: str
     script_output: str
-    schema: dict[str,
-    num_objects:
-    size:
-    _preview_data:
+    schema: dict[str, SQLType | type[SQLType]]
+    num_objects: int | None
+    size: int | None
+    _preview_data: str | list[dict] | None
     sources: str = ""
     query_script: str = ""
-    job_id:
+    job_id: str | None = None
 
     @classmethod
     def parse(  # noqa: PLR0913
@@ -185,22 +243,27 @@ class DatasetVersion:
         id: int,
         uuid: str,
         dataset_id: int,
-        version:
+        version: str,
         status: int,
-        feature_schema:
+        feature_schema: str | None,
         created_at: datetime,
-        finished_at:
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
         script_output: str,
-        num_objects:
-        size:
-        preview:
-        schema: dict[str,
+        num_objects: int | None,
+        size: int | None,
+        preview: str | list[dict] | None,
+        schema: str | dict[str, SQLType | type[SQLType]],
         sources: str = "",
         query_script: str = "",
-        job_id:
+        job_id: str | None = None,
     ):
+        if isinstance(schema, str):
+            schema_parsed = parse_schema(json.loads(schema) if schema else {})
+        else:
+            schema_parsed = schema
+
         return cls(
             id,
             uuid,
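DatasetVersion.parse now accepts the schema either as the JSON string stored in the metastore or as an already-parsed mapping, normalizing the string form through parse_schema from earlier in this file. A rough sketch of the string path; the "String" and "Int64" type names are an assumption here, since the valid keys live in NAME_TYPES_MAPPING in datachain.sql.types and are not shown in this diff:

import json

from datachain.dataset import parse_schema

raw = '{"path": {"type": "String"}, "size": {"type": "Int64"}}'
schema_parsed = parse_schema(json.loads(raw))
# schema_parsed maps column names to the SQL type objects produced by each
# type's from_dict(); unknown or missing type names raise ValueError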
@@ -213,7 +276,7 @@ class DatasetVersion:
             error_message,
             error_stack,
             script_output,
-
+            schema_parsed,
             num_objects,
             size,
             preview,
@@ -222,6 +285,10 @@ class DatasetVersion:
             job_id,
         )
 
+    @property
+    def version_value(self) -> int:
+        return semver.value(self.version)
+
     def __eq__(self, other):
         if not isinstance(other, DatasetVersion):
             return False
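version_value delegates to the new datachain/semver.py module (+68 lines in the file list above), whose implementation is not part of this diff — only its call sites are. A hypothetical sketch of what parse/create/value could look like, assuming value() packs the three components into a single integer so that integer order matches (major, minor, patch) tuple order:

# Hypothetical sketch of the datachain.semver helpers used above
def parse(version: str) -> tuple[int, int, int]:
    # "3.0.1" -> (3, 0, 1)
    major, minor, patch = (int(p) for p in version.split("."))
    return major, minor, patch

def create(major: int, minor: int, patch: int) -> str:
    return f"{major}.{minor}.{patch}"

def value(version: str) -> int:
    # Assumes each component stays below 1000
    major, minor, patch = parse(version)
    return major * 1_000_000 + minor * 1_000 + patch

assert value("2.4.0") > value("2.1.1") > value("1.4.1")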
@@ -230,7 +297,7 @@ class DatasetVersion:
     def __lt__(self, other):
         if not isinstance(other, DatasetVersion):
             return False
-        return self.
+        return self.version_value < other.version_value
 
     def __hash__(self):
         return hash(f"{self.dataset_id}_{self.version}")
@@ -257,7 +324,7 @@ class DatasetVersion:
         }
 
     @cached_property
-    def preview(self) ->
+    def preview(self) -> list[dict] | None:
         if isinstance(self._preview_data, str):
             return json.loads(self._preview_data)
         return self._preview_data if self._preview_data else None
@@ -275,16 +342,16 @@ class DatasetListVersion:
     id: int
     uuid: str
     dataset_id: int
-    version:
+    version: str
     status: int
     created_at: datetime
-    finished_at:
+    finished_at: datetime | None
     error_message: str
     error_stack: str
-    num_objects:
-    size:
+    num_objects: int | None
+    size: int | None
     query_script: str = ""
-    job_id:
+    job_id: str | None = None
 
     @classmethod
     def parse(
@@ -292,16 +359,16 @@ class DatasetListVersion:
         id: int,
         uuid: str,
         dataset_id: int,
-        version:
+        version: str,
         status: int,
         created_at: datetime,
-        finished_at:
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
-        num_objects:
-        size:
+        num_objects: int | None,
+        size: int | None,
         query_script: str = "",
-        job_id:
+        job_id: str | None = None,
         **kwargs,
     ):
         return cls(
@@ -323,45 +390,65 @@ class DatasetListVersion:
     def __hash__(self):
         return hash(f"{self.dataset_id}_{self.version}")
 
+    @property
+    def version_value(self) -> int:
+        return semver.value(self.version)
+
 
 @dataclass
 class DatasetRecord:
     id: int
     name: str
-
-
-
+    project: Project
+    description: str | None
+    attrs: list[str]
+    schema: dict[str, SQLType | type[SQLType]]
     feature_schema: dict
     versions: list[DatasetVersion]
     status: int = DatasetStatus.CREATED
-    created_at:
-    finished_at:
+    created_at: datetime | None = None
+    finished_at: datetime | None = None
     error_message: str = ""
     error_stack: str = ""
     script_output: str = ""
     sources: str = ""
     query_script: str = ""
 
+    def __hash__(self):
+        return hash(f"{self.id}")
+
     @staticmethod
-    def
-
-
-
-
-
-
+    def validate_name(name: str) -> None:
+        """Throws exception if name has reserved characters"""
+        for c in DATASET_NAME_RESERVED_CHARS:
+            if c in name:
+                raise InvalidDatasetNameError(
+                    f"Character {c} is reserved and not allowed in dataset name"
+                )
 
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: str | None,
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: str | None,
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
+        dataset_project_id: int,
         name: str,
-        description:
-
+        description: str | None,
+        attrs: str,
         status: int,
-        feature_schema:
+        feature_schema: str | None,
         created_at: datetime,
-        finished_at:
+        finished_at: datetime | None,
         error_message: str,
         error_stack: str,
         script_output: str,
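The new validate_name rejects exactly the characters that now carry structure in dataset addressing: "." separates namespace, project and name, and "@" separates the version. For example:

from datachain.dataset import DatasetRecord
from datachain.error import InvalidDatasetNameError

DatasetRecord.validate_name("zalando")  # passes silently

try:
    DatasetRecord.validate_name("zalando@v1")
except InvalidDatasetNameError as e:
    print(e)  # Character @ is reserved and not allowed in dataset name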
@@ -371,26 +458,40 @@ class DatasetRecord:
         version_id: int,
         version_uuid: str,
         version_dataset_id: int,
-        version:
+        version: str,
         version_status: int,
-        version_feature_schema:
+        version_feature_schema: str | None,
         version_created_at: datetime,
-        version_finished_at:
+        version_finished_at: datetime | None,
         version_error_message: str,
         version_error_stack: str,
         version_script_output: str,
-        version_num_objects:
-        version_size:
-        version_preview:
-        version_sources:
-        version_query_script:
+        version_num_objects: int | None,
+        version_size: int | None,
+        version_preview: str | None,
+        version_sources: str | None,
+        version_query_script: str | None,
         version_schema: str,
-        version_job_id:
+        version_job_id: str | None = None,
     ) -> "DatasetRecord":
-
+        attrs_lst: list[str] = json.loads(attrs) if attrs else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
-
-
+
+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
         )
 
         dataset_version = DatasetVersion.parse(
@@ -408,18 +509,19 @@ class DatasetRecord:
             version_num_objects,
             version_size,
             version_preview,
-
+            version_schema,
             version_sources,  # type: ignore[arg-type]
             version_query_script,  # type: ignore[arg-type]
             version_job_id,
         )
 
         return cls(
-
+            dataset_id,
             name,
+            project,
             description,
-
-
+            attrs_lst,
+            parse_schema(schema_dct),  # type: ignore[arg-type]
             json.loads(feature_schema) if feature_schema else {},
             [dataset_version],
             status,
@@ -441,7 +543,11 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }
 
-
+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
+    def get_schema(self, version: str) -> dict[str, SQLType | type[SQLType]]:
         return self.get_version(version).schema if version else self.schema
 
     def update(self, **kwargs):
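full_name recomposes the dotted form that parse_dataset_name splits apart. A sketch that builds a record by hand; the positional Namespace/Project arguments mirror the constructor calls inside DatasetRecord.parse above, and all values are made up:

from datetime import datetime

from datachain.dataset import DatasetRecord, parse_dataset_name
from datachain.namespace import Namespace
from datachain.project import Project

ns = Namespace(1, "ns-uuid", "dev", None, datetime.now())
prj = Project(1, "prj-uuid", "clothes", None, datetime.now(), ns)
record = DatasetRecord(
    id=1,
    name="zalando",
    project=prj,
    description=None,
    attrs=[],
    schema={},
    feature_schema={},
    versions=[],
)

assert record.full_name == "dev.clothes.zalando"
assert parse_dataset_name(record.full_name) == ("dev", "clothes", "zalando")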
@@ -460,20 +566,23 @@ class DatasetRecord:
             self.versions = []
 
         self.versions = list(set(self.versions + other.versions))
-        self.versions.sort(key=lambda v: v.
+        self.versions.sort(key=lambda v: v.version_value)
         return self
 
-    def has_version(self, version:
-        return version in self.
+    def has_version(self, version: str) -> bool:
+        return version in [v.version for v in self.versions]
 
-    def is_valid_next_version(self, version:
+    def is_valid_next_version(self, version: str) -> bool:
         """
         Checks if a number can be a valid next latest version for dataset.
         The only rule is that it cannot be lower than current latest version
         """
-        return not (
+        return not (
+            self.latest_version
+            and semver.value(self.latest_version) >= semver.value(version)
+        )
 
-    def get_version(self, version:
+    def get_version(self, version: str) -> DatasetVersion:
         if not self.has_version(version):
             raise DatasetVersionNotFoundError(
                 f"Dataset {self.name} does not have version {version}"
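Note that is_valid_next_version only forbids moving backwards or standing still; gaps are fine, since any candidate with a strictly greater semver value passes. Illustratively (assuming `record` is a DatasetRecord whose latest_version is "2.4.0"):

record.is_valid_next_version("2.4.1")  # True: patch bump
record.is_valid_next_version("4.0.0")  # True: version gaps are allowed
record.is_valid_next_version("2.4.0")  # False: must be strictly greater
record.is_valid_next_version("1.9.9")  # False: lower than latest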
@@ -496,15 +605,15 @@ class DatasetRecord:
                 f"Dataset {self.name} does not have version with uuid {uuid}"
             ) from None
 
-    def remove_version(self, version:
+    def remove_version(self, version: str) -> None:
        if not self.versions or not self.has_version(version):
            return
 
        self.versions = [v for v in self.versions if v.version != version]
 
-    def identifier(self, version:
+    def identifier(self, version: str) -> str:
         """
-        Get identifier in the form my-dataset@v3
+        Get identifier in the form my-dataset@v3.0.1
         """
         if not self.has_version(version):
             raise DatasetVersionNotFoundError(
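identifier and uri (redefined in the next hunk) render the short and fully qualified forms of the same address. Continuing the hypothetical dev.clothes.zalando record from the earlier sketch, assuming version "3.0.1" is registered in record.versions:

record.identifier("3.0.1")  # "zalando@v3.0.1"
record.uri("3.0.1")         # "ds://dev.clothes.zalando@v3.0.1"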
@@ -512,83 +621,172 @@ class DatasetRecord:
             )
         return f"{self.name}@v{version}"
 
-    def uri(self, version:
+    def uri(self, version: str) -> str:
         """
-        Dataset uri example: ds://dogs@v3
+        Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
-        return
+        return (
+            f"{DATASET_PREFIX}{self.project.namespace.name}"
+            f".{self.project.name}.{identifier}"
+        )
 
     @property
-    def
+    def next_version_major(self) -> str:
         """
-
-        in self.versions attribute
+        Returns the next auto-incremented version if the major part is being bumped.
         """
         if not self.versions:
-            return
+            return "1.0.0"
 
-
+        major, _, _ = semver.parse(self.latest_version)
+        return semver.create(major + 1, 0, 0)
 
     @property
-    def
-        """
+    def next_version_minor(self) -> str:
+        """
+        Returns the next auto-incremented version if the minor part is being bumped.
+        """
         if not self.versions:
-            return 1
-
+            return "1.0.0"
+
+        major, minor, _ = semver.parse(self.latest_version)
+        return semver.create(major, minor + 1, 0)
 
     @property
-    def
-        """
-
+    def next_version_patch(self) -> str:
+        """
+        Returns the next auto-incremented version if the patch part is being bumped.
+        """
+        if not self.versions:
+            return "1.0.0"
+
+        major, minor, patch = semver.parse(self.latest_version)
+        return semver.create(major, minor, patch + 1)
 
     @property
-    def
-        """Returns
-
+    def latest_version(self) -> str:
+        """Returns latest version of a dataset"""
+        return max(self.versions).version
+
+    def latest_major_version(self, major: int) -> str | None:
+        """
+        Returns latest specific major version, e.g if dataset has versions:
+          - 1.4.1
+          - 2.0.1
+          - 2.1.1
+          - 2.4.0
+        and we call `.latest_major_version(2)` it will return: "2.4.0".
+        If no major version is find with input value, None will be returned
+        """
+        versions = [v for v in self.versions if semver.parse(v.version)[0] == major]
+        if not versions:
+            return None
+        return max(versions).version
+
+    def latest_compatible_version(self, version_spec: str) -> str | None:
+        """
+        Returns the latest version that matches the given version specifier.
+
+        Supports Python version specifiers like:
+        - ">=1.0.0,<2.0.0" (compatible release range)
+        - "~=1.4.2" (compatible release clause)
+        - "==1.2.*" (prefix matching)
+        - ">1.0.0" (exclusive ordered comparison)
+        - ">=1.0.0" (inclusive ordered comparison)
+        - "!=1.3.0" (version exclusion)
+
+        Args:
+            version_spec: Version specifier string following PEP 440
+
+        Returns:
+            Latest compatible version string, or None if no compatible version found
+        """
+        spec_set = SpecifierSet(version_spec)
+
+        # Convert dataset versions to packaging.Version objects
+        # and filter compatible ones
+        compatible_versions = []
+        for v in self.versions:
+            pkg_version = Version(v.version)
+            if spec_set.contains(pkg_version):
+                compatible_versions.append(v)
+
+        if not compatible_versions:
             return None
 
-
+        # Return the latest compatible version
+        return max(compatible_versions).version
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-        return cls(**kwargs, versions=versions)
+        return cls(**kwargs, versions=versions, project=project)
 
 
 @dataclass
 class DatasetListRecord:
     id: int
     name: str
-
-
+    project: Project
+    description: str | None
+    attrs: list[str]
     versions: list[DatasetListVersion]
-    created_at:
+    created_at: datetime | None = None
 
     @classmethod
     def parse(  # noqa: PLR0913
         cls,
-
+        namespace_id: int,
+        namespace_uuid: str,
+        namespace_name: str,
+        namespace_description: str | None,
+        namespace_created_at: datetime,
+        project_id: int,
+        project_uuid: str,
+        project_name: str,
+        project_description: str | None,
+        project_created_at: datetime,
+        project_namespace_id: int,
+        dataset_id: int,
         name: str,
-        description:
-
+        description: str | None,
+        attrs: str,
         created_at: datetime,
         version_id: int,
         version_uuid: str,
         version_dataset_id: int,
-        version:
+        version: str,
         version_status: int,
         version_created_at: datetime,
-        version_finished_at:
+        version_finished_at: datetime | None,
         version_error_message: str,
         version_error_stack: str,
-        version_num_objects:
-        version_size:
-        version_query_script:
-        version_job_id:
+        version_num_objects: int | None,
+        version_size: int | None,
+        version_query_script: str | None,
+        version_job_id: str | None = None,
     ) -> "DatasetListRecord":
-
+        attrs_lst: list[str] = json.loads(attrs) if attrs else []
+
+        namespace = Namespace(
+            namespace_id,
+            namespace_uuid,
+            namespace_name,
+            namespace_description,
+            namespace_created_at,
+        )
+
+        project = Project(
+            project_id,
+            project_uuid,
+            project_name,
+            project_description,
+            project_created_at,
+            namespace,
+        )
 
         dataset_version = DatasetListVersion.parse(
             version_id,
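The selection logic in latest_compatible_version is plain packaging machinery, so it can be reproduced standalone. This runnable sketch mirrors the method body, using the example versions from the latest_major_version docstring:

from packaging.specifiers import SpecifierSet
from packaging.version import Version

versions = ["1.4.1", "2.0.1", "2.1.1", "2.4.0"]

# Range specifier: latest match below the upper bound
spec = SpecifierSet(">=2.0.0,<2.4.0")
compatible = [v for v in versions if spec.contains(Version(v))]
print(max(compatible, key=Version))  # 2.1.1

# Compatible-release clause: "~=2.0" accepts the whole 2.x line
spec = SpecifierSet("~=2.0")
compatible = [v for v in versions if spec.contains(Version(v))]
print(max(compatible, key=Version))  # 2.4.0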
@@ -607,14 +805,19 @@ class DatasetListRecord:
         )
 
         return cls(
-
+            dataset_id,
             name,
+            project,
             description,
-
+            attrs_lst,
             [dataset_version],
             created_at,
         )
 
+    @property
+    def full_name(self) -> str:
+        return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
     def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
         """Merge versions from another dataset"""
         if other.id != self.id:
@@ -626,11 +829,11 @@ class DatasetListRecord:
             self.versions = []
 
         self.versions = list(set(self.versions + other.versions))
-        self.versions.sort(key=lambda v: v.
+        self.versions.sort(key=lambda v: v.version_value)
         return self
 
     def latest_version(self) -> DatasetListVersion:
-        return max(self.versions, key=lambda v: v.
+        return max(self.versions, key=lambda v: v.version_value)
 
     @property
     def is_bucket_listing(self) -> bool:
@@ -641,7 +844,7 @@ class DatasetListRecord:
         from datachain.client import Client
 
         # TODO refactor and maybe remove method in
-        # https://github.com/
+        # https://github.com/datachain-ai/datachain/issues/318
         return Client.is_data_source_uri(self.name) or self.name.startswith(
             LISTING_PREFIX
         )
@@ -651,9 +854,11 @@ class DatasetListRecord:
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+        project = Project.from_dict(d.pop("project"))
         versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
         kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
         kwargs["versions"] = versions
+        kwargs["project"] = project
         return cls(**kwargs)
 
 