datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/dataset.py CHANGED
@@ -1,18 +1,17 @@
  import builtins
- import json
  from dataclasses import dataclass, fields
  from datetime import datetime
  from functools import cached_property
- from typing import (
-     Any,
-     NewType,
-     Optional,
-     TypeVar,
-     Union,
- )
+ from typing import Any, NewType, TypeVar
  from urllib.parse import urlparse

- from datachain.error import DatasetVersionNotFoundError
+ from packaging.specifiers import SpecifierSet
+ from packaging.version import Version
+
+ from datachain import json, semver
+ from datachain.error import DatasetVersionNotFoundError, InvalidDatasetNameError
+ from datachain.namespace import Namespace
+ from datachain.project import Project

  from datachain.sql.types import NAME_TYPES_MAPPING, SQLType

  T = TypeVar("T", bound="DatasetRecord")
@@ -25,6 +24,10 @@ DATASET_PREFIX = "ds://"
  QUERY_DATASET_PREFIX = "ds_query_"
  LISTING_PREFIX = "lst__"

+ DEFAULT_DATASET_VERSION = "1.0.0"
+ DATASET_NAME_RESERVED_CHARS = [".", "@"]
+ DATASET_NAME_REPLACEMENT_CHAR = "_"
+

  # StorageURI represents a normalised URI to a valid storage location (full bucket or
  # absolute local path).
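Dataset versions are now semver strings (defaulting to `DEFAULT_DATASET_VERSION = "1.0.0"`), and `.` and `@` become reserved because they delimit the namespace/project parts and the version in dataset references. A minimal sketch of how a raw name could be sanitized with these constants; the `sanitize_dataset_name` helper below is hypothetical, since this hunk only introduces the constants:

```python
# Hypothetical helper (not in this diff): strip reserved characters from a
# candidate dataset name using the new constants.
DATASET_NAME_RESERVED_CHARS = [".", "@"]
DATASET_NAME_REPLACEMENT_CHAR = "_"


def sanitize_dataset_name(name: str) -> str:
    for c in DATASET_NAME_RESERVED_CHARS:
        name = name.replace(c, DATASET_NAME_REPLACEMENT_CHAR)
    return name


assert sanitize_dataset_name("imagenet@v2.raw") == "imagenet_v2_raw"
```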
@@ -33,12 +36,12 @@ LISTING_PREFIX = "lst__"
  StorageURI = NewType("StorageURI", str)


- def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
+ def parse_dataset_uri(uri: str) -> tuple[str, str | None]:
      """
      Parse dataset uri to extract name and version out of it (if version is defined)
      Example:
-         Input: ds://zalando@v3
-         Output: (zalando, 3)
+         Input: ds://zalando@v3.0.1
+         Output: (zalando, 3.0.1)
      """
      p = urlparse(uri)
      if p.scheme != "ds":
@@ -51,24 +54,74 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
          raise Exception(
              "Wrong dataset uri format, it should be: ds://<name>@v<version>"
          )
-     version = int(s[1])
-     return name, version
+     return name, s[1]


- def create_dataset_uri(name: str, version: Optional[int] = None) -> str:
+ def create_dataset_uri(
+     name: str, namespace: str, project: str, version: str | None = None
+ ) -> str:
      """
-     Creates a dataset uri based on dataset name and optionally version
+     Creates a dataset uri based on namespace, project, dataset name and optionally
+     version.
      Example:
-         Input: zalando, 3
-         Output: ds://zalando@v3
+         Input: dev, clothes, zalando, 3.0.1
+         Output: ds://dev.clothes.zalando@v3.0.1
      """
-     uri = f"{DATASET_PREFIX}{name}"
+     uri = f"{DATASET_PREFIX}{namespace}.{project}.{name}"
      if version:
          uri += f"@v{version}"

      return uri


+ def parse_dataset_name(name: str) -> tuple[str | None, str | None, str]:
+     """Parses dataset name and returns namespace, project and name"""
+     if not name:
+         raise InvalidDatasetNameError("Name must be defined to parse it")
+     split = name.split(".")
+     if len(split) > 3:
+         raise InvalidDatasetNameError(f"Invalid dataset name {name}")
+     name = split[-1]
+     project_name = split[-2] if len(split) > 1 else None
+     namespace_name = split[-3] if len(split) > 2 else None
+
+     return namespace_name, project_name, name
+
+
+ def parse_schema(ct: dict[str, Any]) -> dict[str, SQLType | type[SQLType]]:
+     """Parse dataset schema from dictionary representation.
+
+     Args:
+         ct: Dictionary with column definitions
+
+     Returns:
+         Dictionary mapping column names to SQL types
+
+     Raises:
+         TypeError: If schema format is invalid
+         ValueError: If column type is not defined or not supported
+     """
+     if not isinstance(ct, dict):
+         raise TypeError("Schema definition must be a dictionary")
+     res = {}
+     for c_name, c_type in ct.items():
+         if not isinstance(c_type, dict):
+             raise TypeError(f"Schema column '{c_name}' type must be a dictionary")
+         if "type" not in c_type:
+             raise ValueError(f"Schema column '{c_name}' type is not defined")
+         if c_type["type"] not in NAME_TYPES_MAPPING:
+             raise ValueError(
+                 f"Schema column '{c_name}' type '{c_type['type']}' is not supported"
+             )
+         try:
+             res[c_name] = NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type)  # type: ignore [attr-defined]
+         except Exception as e:
+             raise ValueError(
+                 f"Schema column '{c_name}' type '{c_type['type']}' parsing error: {e}"
+             ) from e
+     return res
+
+
  class DatasetDependencyType:
      DATASET = "dataset"
      STORAGE = "storage"
@@ -76,12 +129,16 @@ class DatasetDependencyType:

  @dataclass
  class DatasetDependency:
+     # TODO put `DatasetRecord` instead of name + version which will
+     # simplify codebase in various places
      id: int
      type: str
+     namespace: str
+     project: str
      name: str
-     version: str  # TODO change to int
+     version: str
      created_at: datetime
-     dependencies: list[Optional["DatasetDependency"]]
+     dependencies: list["DatasetDependency | None"]

      @property
      def dataset_name(self) -> str:
@@ -91,40 +148,41 @@ class DatasetDependency:
          if self.type == DatasetDependencyType.DATASET:
              return self.name

-         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
+         list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"))
          assert list_dataset_name
          return list_dataset_name

      @classmethod
      def parse(
          cls: builtins.type[DD],
+         namespace_name: str,
+         project_name: str,
          id: int,
-         dataset_id: Optional[int],
-         dataset_version_id: Optional[int],
-         dataset_name: Optional[str],
-         dataset_version: Optional[int],
-         dataset_version_created_at: Optional[datetime],
-     ) -> Optional["DatasetDependency"]:
-         from datachain.client import Client
-         from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+         dataset_id: int | None,
+         dataset_version_id: int | None,
+         dataset_name: str | None,
+         dataset_version: str | None,
+         dataset_version_created_at: datetime | None,
+     ) -> "DatasetDependency | None":
+         from datachain.lib.listing import is_listing_dataset

          if not dataset_id:
              return None

          assert dataset_name is not None
-         dependency_type = DatasetDependencyType.DATASET
-         dependency_name = dataset_name
-
-         if is_listing_dataset(dataset_name):
-             dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
-             dependency_name, _ = Client.parse_url(listing_uri_from_name(dataset_name))

          return cls(
              id,
-             dependency_type,
-             dependency_name,
              (
-                 str(dataset_version)  # type: ignore[arg-type]
+                 DatasetDependencyType.STORAGE
+                 if is_listing_dataset(dataset_name)
+                 else DatasetDependencyType.DATASET
+             ),
+             namespace_name,
+             project_name,
+             dataset_name,
+             (
+                 dataset_version  # type: ignore[arg-type]
                  if dataset_version
                  else None
              ),
@@ -163,21 +221,21 @@ class DatasetVersion:
      id: int
      uuid: str
      dataset_id: int
-     version: int
+     version: str
      status: int
      feature_schema: dict
      created_at: datetime
-     finished_at: Optional[datetime]
+     finished_at: datetime | None
      error_message: str
      error_stack: str
      script_output: str
-     schema: dict[str, Union[SQLType, type[SQLType]]]
-     num_objects: Optional[int]
-     size: Optional[int]
-     _preview_data: Optional[Union[str, list[dict]]]
+     schema: dict[str, SQLType | type[SQLType]]
+     num_objects: int | None
+     size: int | None
+     _preview_data: str | list[dict] | None
      sources: str = ""
      query_script: str = ""
-     job_id: Optional[str] = None
+     job_id: str | None = None

      @classmethod
      def parse(  # noqa: PLR0913
@@ -185,22 +243,27 @@ class DatasetVersion:
          id: int,
          uuid: str,
          dataset_id: int,
-         version: int,
+         version: str,
          status: int,
-         feature_schema: Optional[str],
+         feature_schema: str | None,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
          script_output: str,
-         num_objects: Optional[int],
-         size: Optional[int],
-         preview: Optional[Union[str, list[dict]]],
-         schema: dict[str, Union[SQLType, type[SQLType]]],
+         num_objects: int | None,
+         size: int | None,
+         preview: str | list[dict] | None,
+         schema: str | dict[str, SQLType | type[SQLType]],
          sources: str = "",
          query_script: str = "",
-         job_id: Optional[str] = None,
+         job_id: str | None = None,
      ):
+         if isinstance(schema, str):
+             schema_parsed = parse_schema(json.loads(schema) if schema else {})
+         else:
+             schema_parsed = schema
+
          return cls(
              id,
              uuid,
@@ -213,7 +276,7 @@ class DatasetVersion:
              error_message,
              error_stack,
              script_output,
-             schema,
+             schema_parsed,
              num_objects,
              size,
              preview,
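`DatasetVersion.parse` now accepts the schema either as the raw JSON string stored in the warehouse or as an already-parsed mapping, normalizing the former through the module-level `parse_schema`. A sketch of the JSON round-trip; the type names `"Int64"` and `"String"` are assumed to be keys of `NAME_TYPES_MAPPING` (only the lookup mechanism itself is shown in this diff):

```python
import json

from datachain.dataset import parse_schema

# Column map as persisted for a dataset version (illustrative type names)
version_schema = '{"id": {"type": "Int64"}, "path": {"type": "String"}}'

schema = parse_schema(json.loads(version_schema))
# -> {"id": Int64, "path": String}: SQLType classes from NAME_TYPES_MAPPING

# Malformed definitions now fail loudly instead of raising a bare KeyError:
try:
    parse_schema({"id": {}})
except ValueError as e:
    print(e)  # Schema column 'id' type is not defined
```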
@@ -222,6 +285,10 @@ class DatasetVersion:
              job_id,
          )

+     @property
+     def version_value(self) -> int:
+         return semver.value(self.version)
+
      def __eq__(self, other):
          if not isinstance(other, DatasetVersion):
              return False
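`version_value` delegates to the new `datachain/semver.py` module (+68 lines in this release), which is not part of this file's diff. A plausible sketch of the three helpers used here, assuming `value()` packs the parts into a single sortable integer; the real implementations may differ (e.g. in validation and the packing base):

```python
# Sketch only: stand-ins for semver.parse / semver.create / semver.value.
def parse(version: str) -> tuple[int, int, int]:
    major, minor, patch = (int(part) for part in version.split("."))
    return major, minor, patch


def create(major: int, minor: int, patch: int) -> str:
    return f"{major}.{minor}.{patch}"


def value(version: str) -> int:
    # Pack into one integer so string versions sort numerically, not lexically.
    major, minor, patch = parse(version)
    return (major * 1000 + minor) * 1000 + patch


assert value("2.4.0") > value("2.1.1") > value("1.4.1")
```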
@@ -230,7 +297,7 @@ class DatasetVersion:
      def __lt__(self, other):
          if not isinstance(other, DatasetVersion):
              return False
-         return self.version < other.version
+         return self.version_value < other.version_value

      def __hash__(self):
          return hash(f"{self.dataset_id}_{self.version}")
@@ -257,7 +324,7 @@ class DatasetVersion:
          }

      @cached_property
-     def preview(self) -> Optional[list[dict]]:
+     def preview(self) -> list[dict] | None:
          if isinstance(self._preview_data, str):
              return json.loads(self._preview_data)
          return self._preview_data if self._preview_data else None
@@ -275,16 +342,16 @@ class DatasetListVersion:
      id: int
      uuid: str
      dataset_id: int
-     version: int
+     version: str
      status: int
      created_at: datetime
-     finished_at: Optional[datetime]
+     finished_at: datetime | None
      error_message: str
      error_stack: str
-     num_objects: Optional[int]
-     size: Optional[int]
+     num_objects: int | None
+     size: int | None
      query_script: str = ""
-     job_id: Optional[str] = None
+     job_id: str | None = None

      @classmethod
      def parse(
@@ -292,16 +359,16 @@ class DatasetListVersion:
          id: int,
          uuid: str,
          dataset_id: int,
-         version: int,
+         version: str,
          status: int,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
-         num_objects: Optional[int],
-         size: Optional[int],
+         num_objects: int | None,
+         size: int | None,
          query_script: str = "",
-         job_id: Optional[str] = None,
+         job_id: str | None = None,
          **kwargs,
      ):
          return cls(
@@ -323,45 +390,65 @@ class DatasetListVersion:
      def __hash__(self):
          return hash(f"{self.dataset_id}_{self.version}")

+     @property
+     def version_value(self) -> int:
+         return semver.value(self.version)
+

  @dataclass
  class DatasetRecord:
      id: int
      name: str
-     description: Optional[str]
-     labels: list[str]
-     schema: dict[str, Union[SQLType, type[SQLType]]]
+     project: Project
+     description: str | None
+     attrs: list[str]
+     schema: dict[str, SQLType | type[SQLType]]
      feature_schema: dict
      versions: list[DatasetVersion]
      status: int = DatasetStatus.CREATED
-     created_at: Optional[datetime] = None
-     finished_at: Optional[datetime] = None
+     created_at: datetime | None = None
+     finished_at: datetime | None = None
      error_message: str = ""
      error_stack: str = ""
      script_output: str = ""
      sources: str = ""
      query_script: str = ""

+     def __hash__(self):
+         return hash(f"{self.id}")
+
      @staticmethod
-     def parse_schema(
-         ct: dict[str, Any],
-     ) -> dict[str, Union[SQLType, type[SQLType]]]:
-         return {
-             c_name: NAME_TYPES_MAPPING[c_type["type"]].from_dict(c_type)  # type: ignore [attr-defined]
-             for c_name, c_type in ct.items()
-         }
+     def validate_name(name: str) -> None:
+         """Throws exception if name has reserved characters"""
+         for c in DATASET_NAME_RESERVED_CHARS:
+             if c in name:
+                 raise InvalidDatasetNameError(
+                     f"Character {c} is reserved and not allowed in dataset name"
+                 )

      @classmethod
      def parse(  # noqa: PLR0913
          cls,
-         id: int,
+         namespace_id: int,
+         namespace_uuid: str,
+         namespace_name: str,
+         namespace_description: str | None,
+         namespace_created_at: datetime,
+         project_id: int,
+         project_uuid: str,
+         project_name: str,
+         project_description: str | None,
+         project_created_at: datetime,
+         project_namespace_id: int,
+         dataset_id: int,
+         dataset_project_id: int,
          name: str,
-         description: Optional[str],
-         labels: str,
+         description: str | None,
+         attrs: str,
          status: int,
-         feature_schema: Optional[str],
+         feature_schema: str | None,
          created_at: datetime,
-         finished_at: Optional[datetime],
+         finished_at: datetime | None,
          error_message: str,
          error_stack: str,
          script_output: str,
@@ -371,26 +458,40 @@ class DatasetRecord:
          version_id: int,
          version_uuid: str,
          version_dataset_id: int,
-         version: int,
+         version: str,
          version_status: int,
-         version_feature_schema: Optional[str],
+         version_feature_schema: str | None,
          version_created_at: datetime,
-         version_finished_at: Optional[datetime],
+         version_finished_at: datetime | None,
          version_error_message: str,
          version_error_stack: str,
          version_script_output: str,
-         version_num_objects: Optional[int],
-         version_size: Optional[int],
-         version_preview: Optional[str],
-         version_sources: Optional[str],
-         version_query_script: Optional[str],
+         version_num_objects: int | None,
+         version_size: int | None,
+         version_preview: str | None,
+         version_sources: str | None,
+         version_query_script: str | None,
          version_schema: str,
-         version_job_id: Optional[str] = None,
+         version_job_id: str | None = None,
      ) -> "DatasetRecord":
-         labels_lst: list[str] = json.loads(labels) if labels else []
+         attrs_lst: list[str] = json.loads(attrs) if attrs else []
          schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
-         version_schema_dct: dict[str, str] = (
-             json.loads(version_schema) if version_schema else {}
+
+         namespace = Namespace(
+             namespace_id,
+             namespace_uuid,
+             namespace_name,
+             namespace_description,
+             namespace_created_at,
+         )
+
+         project = Project(
+             project_id,
+             project_uuid,
+             project_name,
+             project_description,
+             project_created_at,
+             namespace,
          )

          dataset_version = DatasetVersion.parse(
@@ -408,18 +509,19 @@ class DatasetRecord:
              version_num_objects,
              version_size,
              version_preview,
-             cls.parse_schema(version_schema_dct),  # type: ignore[arg-type]
+             version_schema,
              version_sources,  # type: ignore[arg-type]
              version_query_script,  # type: ignore[arg-type]
              version_job_id,
          )

          return cls(
-             id,
+             dataset_id,
              name,
+             project,
              description,
-             labels_lst,
-             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
+             attrs_lst,
+             parse_schema(schema_dct),  # type: ignore[arg-type]
              json.loads(feature_schema) if feature_schema else {},
              [dataset_version],
              status,
@@ -441,7 +543,11 @@ class DatasetRecord:
              for c_name, c_type in self.schema.items()
          }

-     def get_schema(self, version: int) -> dict[str, Union[SQLType, type[SQLType]]]:
+     @property
+     def full_name(self) -> str:
+         return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
+     def get_schema(self, version: str) -> dict[str, SQLType | type[SQLType]]:
          return self.get_version(version).schema if version else self.schema

      def update(self, **kwargs):
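`full_name` assembles the dotted identifier from the record's project hierarchy. A sketch with minimal stand-ins for the new `Namespace` and `Project` classes (the real ones, added in datachain/namespace.py and datachain/project.py, carry more fields such as id, uuid, and created_at):

```python
from dataclasses import dataclass


@dataclass
class Namespace:
    name: str


@dataclass
class Project:
    name: str
    namespace: Namespace


# A dataset named "zalando" in project "clothes" under namespace "dev"
project = Project("clothes", Namespace("dev"))
full_name = f"{project.namespace.name}.{project.name}.zalando"
assert full_name == "dev.clothes.zalando"
```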
@@ -460,20 +566,23 @@ class DatasetRecord:
              self.versions = []

          self.versions = list(set(self.versions + other.versions))
-         self.versions.sort(key=lambda v: v.version)
+         self.versions.sort(key=lambda v: v.version_value)
          return self

-     def has_version(self, version: int) -> bool:
-         return version in self.versions_values
+     def has_version(self, version: str) -> bool:
+         return version in [v.version for v in self.versions]

-     def is_valid_next_version(self, version: int) -> bool:
+     def is_valid_next_version(self, version: str) -> bool:
          """
          Checks if a version can be a valid next latest version for dataset.
          The only rule is that it cannot be lower than the current latest version
          """
-         return not (self.latest_version and self.latest_version >= version)
+         return not (
+             self.latest_version
+             and semver.value(self.latest_version) >= semver.value(version)
+         )

-     def get_version(self, version: int) -> DatasetVersion:
+     def get_version(self, version: str) -> DatasetVersion:
          if not self.has_version(version):
              raise DatasetVersionNotFoundError(
                  f"Dataset {self.name} does not have version {version}"
@@ -496,15 +605,15 @@ class DatasetRecord:
                  f"Dataset {self.name} does not have version with uuid {uuid}"
              ) from None

-     def remove_version(self, version: int) -> None:
+     def remove_version(self, version: str) -> None:
          if not self.versions or not self.has_version(version):
              return

          self.versions = [v for v in self.versions if v.version != version]

-     def identifier(self, version: int) -> str:
+     def identifier(self, version: str) -> str:
          """
-         Get identifier in the form my-dataset@v3
+         Get identifier in the form my-dataset@v3.0.1
          """
          if not self.has_version(version):
              raise DatasetVersionNotFoundError(
@@ -512,83 +621,172 @@ class DatasetRecord:
          )
          return f"{self.name}@v{version}"

-     def uri(self, version: int) -> str:
+     def uri(self, version: str) -> str:
          """
-         Dataset uri example: ds://dogs@v3
+         Dataset uri example: ds://dogs@v3.0.1
          """
          identifier = self.identifier(version)
-         return f"{DATASET_PREFIX}{identifier}"
+         return (
+             f"{DATASET_PREFIX}{self.project.namespace.name}"
+             f".{self.project.name}.{identifier}"
+         )

      @property
-     def versions_values(self) -> list[int]:
+     def next_version_major(self) -> str:
          """
-         Extracts actual versions from list of DatasetVersion objects
-         in self.versions attribute
+         Returns the next auto-incremented version if the major part is being bumped.
          """
          if not self.versions:
-             return []
+             return "1.0.0"

-         return sorted(v.version for v in self.versions)
+         major, _, _ = semver.parse(self.latest_version)
+         return semver.create(major + 1, 0, 0)

      @property
-     def next_version(self) -> int:
-         """Returns what should be next autoincrement version of dataset"""
+     def next_version_minor(self) -> str:
+         """
+         Returns the next auto-incremented version if the minor part is being bumped.
+         """
          if not self.versions:
-             return 1
-         return max(self.versions_values) + 1
+             return "1.0.0"
+
+         major, minor, _ = semver.parse(self.latest_version)
+         return semver.create(major, minor + 1, 0)

      @property
-     def latest_version(self) -> int:
-         """Returns latest version of a dataset"""
-         return max(self.versions_values)
+     def next_version_patch(self) -> str:
+         """
+         Returns the next auto-incremented version if the patch part is being bumped.
+         """
+         if not self.versions:
+             return "1.0.0"
+
+         major, minor, patch = semver.parse(self.latest_version)
+         return semver.create(major, minor, patch + 1)

      @property
-     def prev_version(self) -> Optional[int]:
-         """Returns previous version of a dataset"""
-         if len(self.versions) == 1:
+     def latest_version(self) -> str:
+         """Returns latest version of a dataset"""
+         return max(self.versions).version
+
+     def latest_major_version(self, major: int) -> str | None:
+         """
+         Returns latest specific major version, e.g. if dataset has versions:
+             - 1.4.1
+             - 2.0.1
+             - 2.1.1
+             - 2.4.0
+         and we call `.latest_major_version(2)` it will return: "2.4.0".
+         If no major version is found with input value, None will be returned
+         """
+         versions = [v for v in self.versions if semver.parse(v.version)[0] == major]
+         if not versions:
+             return None
+         return max(versions).version
+
+     def latest_compatible_version(self, version_spec: str) -> str | None:
+         """
+         Returns the latest version that matches the given version specifier.
+
+         Supports Python version specifiers like:
+             - ">=1.0.0,<2.0.0" (compatible release range)
+             - "~=1.4.2" (compatible release clause)
+             - "==1.2.*" (prefix matching)
+             - ">1.0.0" (exclusive ordered comparison)
+             - ">=1.0.0" (inclusive ordered comparison)
+             - "!=1.3.0" (version exclusion)
+
+         Args:
+             version_spec: Version specifier string following PEP 440
+
+         Returns:
+             Latest compatible version string, or None if no compatible version found
+         """
+         spec_set = SpecifierSet(version_spec)
+
+         # Convert dataset versions to packaging.Version objects
+         # and filter compatible ones
+         compatible_versions = []
+         for v in self.versions:
+             pkg_version = Version(v.version)
+             if spec_set.contains(pkg_version):
+                 compatible_versions.append(v)
+
+         if not compatible_versions:
              return None

-         return sorted(self.versions_values)[-2]
+         # Return the latest compatible version
+         return max(compatible_versions).version

      @classmethod
      def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
+         project = Project.from_dict(d.pop("project"))
          versions = [DatasetVersion.from_dict(v) for v in d.pop("versions", [])]
          kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
-         return cls(**kwargs, versions=versions)
+         return cls(**kwargs, versions=versions, project=project)


  @dataclass
  class DatasetListRecord:
      id: int
      name: str
-     description: Optional[str]
-     labels: list[str]
+     project: Project
+     description: str | None
+     attrs: list[str]
      versions: list[DatasetListVersion]
-     created_at: Optional[datetime] = None
+     created_at: datetime | None = None

      @classmethod
      def parse(  # noqa: PLR0913
          cls,
-         id: int,
+         namespace_id: int,
+         namespace_uuid: str,
+         namespace_name: str,
+         namespace_description: str | None,
+         namespace_created_at: datetime,
+         project_id: int,
+         project_uuid: str,
+         project_name: str,
+         project_description: str | None,
+         project_created_at: datetime,
+         project_namespace_id: int,
+         dataset_id: int,
          name: str,
-         description: Optional[str],
-         labels: str,
+         description: str | None,
+         attrs: str,
          created_at: datetime,
          version_id: int,
          version_uuid: str,
          version_dataset_id: int,
-         version: int,
+         version: str,
          version_status: int,
          version_created_at: datetime,
-         version_finished_at: Optional[datetime],
+         version_finished_at: datetime | None,
          version_error_message: str,
          version_error_stack: str,
-         version_num_objects: Optional[int],
-         version_size: Optional[int],
-         version_query_script: Optional[str],
-         version_job_id: Optional[str] = None,
+         version_num_objects: int | None,
+         version_size: int | None,
+         version_query_script: str | None,
+         version_job_id: str | None = None,
      ) -> "DatasetListRecord":
-         labels_lst: list[str] = json.loads(labels) if labels else []
+         attrs_lst: list[str] = json.loads(attrs) if attrs else []
+
+         namespace = Namespace(
+             namespace_id,
+             namespace_uuid,
+             namespace_name,
+             namespace_description,
+             namespace_created_at,
+         )
+
+         project = Project(
+             project_id,
+             project_uuid,
+             project_name,
+             project_description,
+             project_created_at,
+             namespace,
+         )

          dataset_version = DatasetListVersion.parse(
              version_id,
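Since compatibility matching is delegated to packaging, any PEP 440 specifier that pip accepts works with `latest_compatible_version`. A self-contained sketch of the same selection logic, using the versions from the `latest_major_version` docstring (plain strings stand in for `DatasetVersion` objects):

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

versions = ["1.4.1", "2.0.1", "2.1.1", "2.4.0"]


def latest_compatible(versions: "list[str]", spec: str) -> "str | None":
    # Same filtering as DatasetRecord.latest_compatible_version, but over
    # plain version strings.
    spec_set = SpecifierSet(spec)
    compatible = [v for v in versions if spec_set.contains(Version(v))]
    return max(compatible, key=Version) if compatible else None


assert latest_compatible(versions, ">=1.0.0,<2.0.0") == "1.4.1"
assert latest_compatible(versions, "~=2.0") == "2.4.0"
assert latest_compatible(versions, "!=2.4.0") == "2.1.1"
assert latest_compatible(versions, "==3.*") is None
```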
@@ -607,14 +805,19 @@ class DatasetListRecord:
          )

          return cls(
-             id,
+             dataset_id,
              name,
+             project,
              description,
-             labels_lst,
+             attrs_lst,
              [dataset_version],
              created_at,
          )

+     @property
+     def full_name(self) -> str:
+         return f"{self.project.namespace.name}.{self.project.name}.{self.name}"
+
      def merge_versions(self, other: "DatasetListRecord") -> "DatasetListRecord":
          """Merge versions from another dataset"""
          if other.id != self.id:
@@ -626,11 +829,11 @@ class DatasetListRecord:
              self.versions = []

          self.versions = list(set(self.versions + other.versions))
-         self.versions.sort(key=lambda v: v.version)
+         self.versions.sort(key=lambda v: v.version_value)
          return self

      def latest_version(self) -> DatasetListVersion:
-         return max(self.versions, key=lambda v: v.version)
+         return max(self.versions, key=lambda v: v.version_value)

      @property
      def is_bucket_listing(self) -> bool:
@@ -641,7 +844,7 @@ class DatasetListRecord:
          from datachain.client import Client

          # TODO refactor and maybe remove method in
-         # https://github.com/iterative/datachain/issues/318
+         # https://github.com/datachain-ai/datachain/issues/318
          return Client.is_data_source_uri(self.name) or self.name.startswith(
              LISTING_PREFIX
          )
@@ -651,9 +854,11 @@ class DatasetListRecord:

      @classmethod
      def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+         project = Project.from_dict(d.pop("project"))
          versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
          kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
          kwargs["versions"] = versions
+         kwargs["project"] = project
          return cls(**kwargs)
