datachain 0.16.5__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +20 -91
- datachain/cli/commands/datasets.py +1 -1
- datachain/cli/commands/show.py +1 -1
- datachain/cli/parser/__init__.py +2 -2
- datachain/data_storage/metastore.py +23 -23
- datachain/data_storage/sqlite.py +8 -7
- datachain/data_storage/warehouse.py +12 -12
- datachain/dataset.py +88 -45
- datachain/lib/dataset_info.py +2 -1
- datachain/lib/dc/datachain.py +8 -3
- datachain/lib/dc/datasets.py +28 -7
- datachain/lib/dc/storage.py +10 -2
- datachain/lib/pytorch.py +2 -2
- datachain/listing.py +1 -1
- datachain/query/dataset.py +9 -9
- datachain/query/session.py +2 -2
- datachain/remote/studio.py +4 -4
- datachain/semver.py +58 -0 (new module; see the API sketch after this list)
- datachain/studio.py +1 -1
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/METADATA +1 -1
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/RECORD +25 -24
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/WHEEL +0 -0
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.16.5.dist-info → datachain-0.17.0.dist-info}/top_level.txt +0 -0
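The headline change is the new datachain/semver.py module (+58 lines), whose body is not expanded in this diff. Its API can be inferred from the call sites that are expanded below: semver.parse() returns a (major, minor, patch) tuple, semver.create() builds a version string from those three parts, semver.validate() rejects malformed strings, and semver.value() maps a version to a single integer used for sorting. A minimal sketch consistent with those call sites (the regex and the integer weights are assumptions, not the actual implementation):

    import re

    _SEMVER_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$")  # assumed format check

    def parse(version: str) -> tuple[int, int, int]:
        # Split "2.4.3" into (2, 4, 3); raise on anything malformed.
        match = _SEMVER_RE.match(version)
        if match is None:
            raise ValueError(f"Invalid semver string: {version}")
        major, minor, patch = match.groups()
        return int(major), int(minor), int(patch)

    def create(major: int, minor: int, patch: int) -> str:
        # Build "2.4.3" back from its three numeric parts.
        return f"{major}.{minor}.{patch}"

    def validate(version: str) -> None:
        # Called for its side effect only: raises if not MAJOR.MINOR.PATCH.
        parse(version)

    def value(version: str) -> int:
        # Collapse a version into one sortable integer; the weights are an
        # assumption and only need to preserve ordering for sane versions.
        major, minor, patch = parse(version)
        return major * 1_000_000 + minor * 1_000 + patch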
datachain/dataset.py
CHANGED

@@ -12,6 +12,7 @@ from typing import (
 )
 from urllib.parse import urlparse
 
+from datachain import semver
 from datachain.error import DatasetVersionNotFoundError
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
@@ -25,6 +26,8 @@ DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
 LISTING_PREFIX = "lst__"
 
+DEFAULT_DATASET_VERSION = "1.0.0"
+
 
 # StorageURI represents a normalised URI to a valid storage location (full bucket or
 # absolute local path).
@@ -33,12 +36,12 @@ LISTING_PREFIX = "lst__"
 StorageURI = NewType("StorageURI", str)
 
 
-def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
+def parse_dataset_uri(uri: str) -> tuple[str, Optional[str]]:
     """
     Parse dataser uri to extract name and version out of it (if version is defined)
     Example:
-        Input: ds://zalando@v3
-        Output: (zalando, 3)
+        Input: ds://zalando@v3.0.1
+        Output: (zalando, 3.0.1)
     """
     p = urlparse(uri)
     if p.scheme != "ds":
@@ -51,16 +54,15 @@ def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
         raise Exception(
             "Wrong dataset uri format, it should be: ds://<name>@v<version>"
         )
-
-    return name, version
+    return name, s[1]
 
 
-def create_dataset_uri(name: str, version: Optional[int] = None) -> str:
+def create_dataset_uri(name: str, version: Optional[str] = None) -> str:
     """
     Creates a dataset uri based on dataset name and optionally version
     Example:
-        Input: zalando, 3
-        Output: ds//zalando@v3
+        Input: zalando, 3.0.1
+        Output: ds//zalando@v3.0.1
     """
    uri = f"{DATASET_PREFIX}{name}"
    if version:
@@ -79,7 +81,7 @@ class DatasetDependency:
     id: int
     type: str
     name: str
-    version: str  # TODO change to int
+    version: str
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]
 
@@ -102,7 +104,7 @@ class DatasetDependency:
         dataset_id: Optional[int],
         dataset_version_id: Optional[int],
         dataset_name: Optional[str],
-        dataset_version: Optional[int],
+        dataset_version: Optional[str],
         dataset_version_created_at: Optional[datetime],
     ) -> Optional["DatasetDependency"]:
         from datachain.client import Client
@@ -124,7 +126,7 @@ class DatasetDependency:
             dependency_type,
             dependency_name,
             (
-                str(dataset_version)
+                dataset_version  # type: ignore[arg-type]
                 if dataset_version
                 else None
             ),
@@ -163,7 +165,7 @@ class DatasetVersion:
     id: int
     uuid: str
     dataset_id: int
-    version: int
+    version: str
     status: int
     feature_schema: dict
     created_at: datetime
@@ -185,7 +187,7 @@ class DatasetVersion:
         id: int,
         uuid: str,
         dataset_id: int,
-        version: int,
+        version: str,
         status: int,
         feature_schema: Optional[str],
         created_at: datetime,
@@ -222,6 +224,10 @@ class DatasetVersion:
             job_id,
         )
 
+    @property
+    def version_value(self) -> int:
+        return semver.value(self.version)
+
     def __eq__(self, other):
         if not isinstance(other, DatasetVersion):
             return False
@@ -230,7 +236,7 @@ class DatasetVersion:
     def __lt__(self, other):
         if not isinstance(other, DatasetVersion):
             return False
-        return self.version < other.version
+        return self.version_value < other.version_value
 
     def __hash__(self):
         return hash(f"{self.dataset_id}_{self.version}")
@@ -275,7 +281,7 @@ class DatasetListVersion:
     id: int
     uuid: str
     dataset_id: int
-    version: int
+    version: str
     status: int
     created_at: datetime
     finished_at: Optional[datetime]
@@ -292,7 +298,7 @@ class DatasetListVersion:
         id: int,
         uuid: str,
         dataset_id: int,
-        version: int,
+        version: str,
         status: int,
         created_at: datetime,
         finished_at: Optional[datetime],
@@ -323,6 +329,10 @@ class DatasetListVersion:
     def __hash__(self):
         return hash(f"{self.dataset_id}_{self.version}")
 
+    @property
+    def version_value(self) -> int:
+        return semver.value(self.version)
+
 
 @dataclass
 class DatasetRecord:
@@ -371,7 +381,7 @@ class DatasetRecord:
         version_id: int,
         version_uuid: str,
         version_dataset_id: int,
-        version: int,
+        version: str,
         version_status: int,
         version_feature_schema: Optional[str],
         version_created_at: datetime,
@@ -441,7 +451,7 @@ class DatasetRecord:
             for c_name, c_type in self.schema.items()
         }
 
-    def get_schema(self, version: int) -> dict[str, Union[SQLType, type[SQLType]]]:
+    def get_schema(self, version: str) -> dict[str, Union[SQLType, type[SQLType]]]:
         return self.get_version(version).schema if version else self.schema
 
     def update(self, **kwargs):
@@ -460,20 +470,23 @@ class DatasetRecord:
             self.versions = []
 
         self.versions = list(set(self.versions + other.versions))
-        self.versions.sort(key=lambda v: v.version)
+        self.versions.sort(key=lambda v: v.version_value)
         return self
 
-    def has_version(self, version: int) -> bool:
-        return version in self.versions_values
+    def has_version(self, version: str) -> bool:
+        return version in [v.version for v in self.versions]
 
-    def is_valid_next_version(self, version: int) -> bool:
+    def is_valid_next_version(self, version: str) -> bool:
         """
         Checks if a number can be a valid next latest version for dataset.
         The only rule is that it cannot be lower than current latest version
         """
-        return not (self.latest_version and self.latest_version >= version)
+        return not (
+            self.latest_version
+            and semver.value(self.latest_version) >= semver.value(version)
+        )
 
-    def get_version(self, version: int) -> DatasetVersion:
+    def get_version(self, version: str) -> DatasetVersion:
         if not self.has_version(version):
             raise DatasetVersionNotFoundError(
                 f"Dataset {self.name} does not have version {version}"
@@ -496,15 +509,15 @@ class DatasetRecord:
             f"Dataset {self.name} does not have version with uuid {uuid}"
         ) from None
 
-    def remove_version(self, version: int) -> None:
+    def remove_version(self, version: str) -> None:
        if not self.versions or not self.has_version(version):
            return
 
         self.versions = [v for v in self.versions if v.version != version]
 
-    def identifier(self, version: int) -> str:
+    def identifier(self, version: str) -> str:
         """
-        Get identifier in the form my-dataset@v3
+        Get identifier in the form my-dataset@v3.0.1
         """
         if not self.has_version(version):
             raise DatasetVersionNotFoundError(
@@ -512,43 +525,73 @@ class DatasetRecord:
         )
         return f"{self.name}@v{version}"
 
-    def uri(self, version: int) -> str:
+    def uri(self, version: str) -> str:
         """
-        Dataset uri example: ds://dogs@v3
+        Dataset uri example: ds://dogs@v3.0.1
         """
         identifier = self.identifier(version)
         return f"{DATASET_PREFIX}{identifier}"
 
     @property
-    def versions_values(self) -> list[int]:
+    def next_version_major(self) -> str:
         """
-        Extracts actual versions from list of DatasetVersion objects
-        in self.versions attribute
+        Returns the next auto-incremented version if the major part is being bumped.
         """
         if not self.versions:
-            return []
+            return "1.0.0"
 
-        return [v.version for v in self.versions]
+        major, minor, patch = semver.parse(self.latest_version)
+        return semver.create(major + 1, 0, 0)
 
     @property
-    def next_version(self) -> int:
-        """Returns next autoincremented version of a dataset"""
+    def next_version_minor(self) -> str:
+        """
+        Returns the next auto-incremented version if the minor part is being bumped.
+        """
         if not self.versions:
-            return 1
-        return max(self.versions_values) + 1
+            return "1.0.0"
+
+        major, minor, patch = semver.parse(self.latest_version)
+        return semver.create(major, minor + 1, 0)
 
     @property
-    def latest_version(self) -> int:
+    def next_version_patch(self) -> str:
+        """
+        Returns the next auto-incremented version if the patch part is being bumped.
+        """
+        if not self.versions:
+            return "1.0.0"
+
+        major, minor, patch = semver.parse(self.latest_version)
+        return semver.create(major, minor, patch + 1)
+
+    @property
+    def latest_version(self) -> str:
         """Returns latest version of a dataset"""
-        return max(self.versions_values)
+        return max(self.versions).version
+
+    def latest_major_version(self, major: int) -> Optional[str]:
+        """
+        Returns latest specific major version, e.g if dataset has versions:
+            - 1.4.1
+            - 2.0.1
+            - 2.1.1
+            - 2.4.0
+        and we call `.latest_major_version(2)` it will return: "2.4.0".
+        If no major version is find with input value, None will be returned
+        """
+        versions = [v for v in self.versions if semver.parse(v.version)[0] == major]
+        if not versions:
+            return None
+        return max(versions).version
 
     @property
-    def prev_version(self) -> Optional[int]:
+    def prev_version(self) -> Optional[str]:
         """Returns previous version of a dataset"""
         if len(self.versions) == 1:
             return None
 
-        return sorted(self.versions_values)[-2]
+        return sorted(self.versions)[-2].version
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "DatasetRecord":
@@ -577,7 +620,7 @@ class DatasetListRecord:
         version_id: int,
         version_uuid: str,
         version_dataset_id: int,
-        version: int,
+        version: str,
         version_status: int,
         version_created_at: datetime,
         version_finished_at: Optional[datetime],
@@ -626,11 +669,11 @@ class DatasetListRecord:
             self.versions = []
 
         self.versions = list(set(self.versions + other.versions))
-        self.versions.sort(key=lambda v: v.version)
+        self.versions.sort(key=lambda v: v.version_value)
         return self
 
     def latest_version(self) -> DatasetListVersion:
-        return max(self.versions, key=lambda v: v.version)
+        return max(self.versions, key=lambda v: v.version_value)
 
     @property
     def is_bucket_listing(self) -> bool:
datachain/lib/dataset_info.py
CHANGED

@@ -6,6 +6,7 @@ from uuid import uuid4
 from pydantic import Field, field_validator
 
 from datachain.dataset import (
+    DEFAULT_DATASET_VERSION,
     DatasetListRecord,
     DatasetListVersion,
     DatasetStatus,
@@ -22,7 +23,7 @@ if TYPE_CHECKING:
 class DatasetInfo(DataModel):
     name: str
     uuid: str = Field(default=str(uuid4()))
-    version: int = Field(default=1)
+    version: str = Field(default=DEFAULT_DATASET_VERSION)
     status: int = Field(default=DatasetStatus.CREATED)
     created_at: datetime = Field(default=TIME_ZERO)
     finished_at: Optional[datetime] = Field(default=None)
datachain/lib/dc/datachain.py
CHANGED

@@ -23,6 +23,7 @@ import sqlalchemy
 from pydantic import BaseModel
 from tqdm import tqdm
 
+from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.func import literal
 from datachain.func.base import Function
@@ -214,7 +215,7 @@ class DataChain:
         return self._query.name
 
     @property
-    def version(self) -> Optional[int]:
+    def version(self) -> Optional[str]:
         """Version of the underlying dataset, if there is one."""
         return self._query.version
 
@@ -457,7 +458,7 @@ class DataChain:
     def save(  # type: ignore[override]
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
         **kwargs,
@@ -466,11 +467,15 @@ class DataChain:
 
         Parameters:
             name : dataset name.
-            version : version of a dataset.
+            version : version of a dataset. If version is not specified and dataset
+                already exists, version patch increment will happen e.g 1.2.1 -> 1.2.2.
             description : description of a dataset.
             attrs : attributes of a dataset. They can be without value, e.g "NLP",
                 or with a value, e.g "location=US".
         """
+        if version is not None:
+            semver.validate(version)
+
         schema = self.signals_schema.clone_without_sys_signals().serialize()
         return self._evolve(
             query=self._query.save(
datachain/lib/dc/datasets.py
CHANGED

@@ -1,5 +1,6 @@
-from typing import TYPE_CHECKING, Optional, get_origin, get_type_hints
+from typing import TYPE_CHECKING, Optional, Union, get_origin, get_type_hints
 
+from datachain.error import DatasetVersionNotFoundError
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import (
     File,
@@ -22,7 +23,7 @@ if TYPE_CHECKING:
 
 def read_dataset(
     name: str,
-    version: Optional[int] = None,
+    version: Optional[Union[str, int]] = None,
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     fallback_to_studio: bool = True,
@@ -49,7 +50,7 @@ def read_dataset(
     ```
 
     ```py
-    chain = dc.read_dataset("my_cats", version=1)
+    chain = dc.read_dataset("my_cats", version="1.0.0")
     ```
 
     ```py
@@ -63,7 +64,7 @@ def read_dataset(
     }
     chain = dc.read_dataset(
         name="my_cats",
-        version=1,
+        version="1.0.0",
         session=session,
         settings=settings,
         fallback_to_studio=True,
@@ -74,9 +75,29 @@ def read_dataset(
 
     from .datachain import DataChain
 
+    if version is not None:
+        try:
+            # for backward compatibility we still allow users to put version as integer
+            # in which case we are trying to find latest version where major part is
+            # equal to that input version. For example if user sets version=2, we could
+            # continue with something like 2.4.3 (assuming 2.4.3 is the biggest among
+            # all 2.* dataset versions). If dataset doesn't have any versions where
+            # major part is equal to that input, exception is thrown.
+            major = int(version)
+            dataset = Session.get(session).catalog.get_dataset(name)
+            latest_major = dataset.latest_major_version(major)
+            if not latest_major:
+                raise DatasetVersionNotFoundError(
+                    f"Dataset {name} does not have version {version}"
+                )
+            version = latest_major
+        except ValueError:
+            # version is in new semver string format, continuing as normal
+            pass
+
     query = DatasetQuery(
         name=name,
-        version=version,
+        version=version,  # type: ignore[arg-type]
         session=session,
         indexing_column_types=File._datachain_column_types,
         fallback_to_studio=fallback_to_studio,
@@ -179,7 +200,7 @@ def datasets(
 
 def delete_dataset(
     name: str,
-    version: Optional[int] = None,
+    version: Optional[str] = None,
     force: Optional[bool] = False,
     studio: Optional[bool] = False,
     session: Optional[Session] = None,
@@ -207,7 +228,7 @@ def delete_dataset(
 
     ```py
     import datachain as dc
-    dc.delete_dataset("cats", version=1)
+    dc.delete_dataset("cats", version="1.0.0")
     ```
     """
 
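The backward-compatibility shim above keeps both calling styles working; a sketch, with version numbers hypothetical:

    import datachain as dc

    # New style: an explicit semver string.
    chain = dc.read_dataset("my_cats", version="2.4.3")

    # Old style: an integer now means "latest version within that major", so
    # version=2 resolves to e.g. "2.4.3" if that is the highest 2.* version.
    chain = dc.read_dataset("my_cats", version=2)

    # An integer major with no matching versions raises DatasetVersionNotFoundError.
    chain = dc.read_dataset("my_cats", version=9)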
datachain/lib/dc/storage.py
CHANGED

@@ -5,6 +5,7 @@ from typing import (
     Union,
 )
 
+from datachain.error import DatasetNotFoundError
 from datachain.lib.file import (
     FileType,
     get_file_type,
@@ -97,7 +98,8 @@ def read_storage(
     if anon:
         client_config = (client_config or {}) | {"anon": True}
     session = Session.get(session, client_config=client_config, in_memory=in_memory)
-    cache = session.catalog.cache
+    catalog = session.catalog
+    cache = catalog.cache
     client_config = session.catalog.client_config
 
     uris = uri if isinstance(uri, (list, tuple)) else [uri]
@@ -130,6 +132,11 @@ def read_storage(
 
     def lst_fn(ds_name, lst_uri):
         # disable prefetch for listing, as it pre-downloads all files
+        try:
+            version = catalog.get_dataset(ds_name).next_version_major
+        except DatasetNotFoundError:
+            version = None
+
         (
             read_records(
                 DataChain.DEFAULT_FILE_RECORD,
@@ -142,7 +149,8 @@ def read_storage(
                 list_bucket(lst_uri, cache, client_config=client_config),
                 output={f"{column}": file_type},
             )
-            .save(ds_name, listing=True)
+            # for internal listing datasets, we always bump major version
+            .save(ds_name, listing=True, version=version)
         )
 
         dc._query.set_listing_fn(
datachain/lib/pytorch.py
CHANGED

@@ -43,7 +43,7 @@ class PytorchDataset(IterableDataset):
     def __init__(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         transform: Optional["Transform"] = None,
         tokenizer: Optional[Callable] = None,
@@ -60,7 +60,7 @@ class PytorchDataset(IterableDataset):
 
         Args:
             name (str): Name of DataChain dataset to stream.
-            version (int): Version of DataChain dataset to stream.
+            version (str): Version of DataChain dataset to stream.
             catalog (Catalog): DataChain catalog to which dataset belongs.
             transform (Transform): Torchvision transforms to apply to the dataset.
             tokenizer (Callable): Tokenizer to use to tokenize text values.
datachain/listing.py
CHANGED
datachain/query/dataset.py
CHANGED

@@ -83,7 +83,7 @@ PartitionByType = Union[
     Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
-DatasetDependencyType = tuple[str, int]
+DatasetDependencyType = tuple[str, str]
 
 logger = logging.getLogger("datachain")
 
@@ -168,7 +168,7 @@ class Step(ABC):
 class QueryStep:
     catalog: "Catalog"
     dataset_name: str
-    dataset_version: int
+    dataset_version: str
 
     def apply(self):
         def q(*columns):
@@ -1092,7 +1092,7 @@ class DatasetQuery:
     def __init__(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         catalog: Optional["Catalog"] = None,
         session: Optional[Session] = None,
         indexing_column_types: Optional[dict[str, Any]] = None,
@@ -1112,7 +1112,7 @@ class DatasetQuery:
         self.table = self.get_table()
         self.starting_step: Optional[QueryStep] = None
         self.name: Optional[str] = None
-        self.version: Optional[int] = None
+        self.version: Optional[str] = None
         self.feature_schema: Optional[dict] = None
         self.column_types: Optional[dict[str, Any]] = None
         self.before_steps: list[Callable] = []
@@ -1155,7 +1155,7 @@ class DatasetQuery:
     def __or__(self, other):
         return self.union(other)
 
-    def pull_dataset(self, name: str, version: Optional[int] = None) -> "DatasetRecord":
+    def pull_dataset(self, name: str, version: Optional[str] = None) -> "DatasetRecord":
         print("Dataset not found in local catalog, trying to get from studio")
 
         remote_ds_uri = f"{DATASET_PREFIX}{name}"
@@ -1185,8 +1185,8 @@ class DatasetQuery:
         it completely. If this is the case, name and version of underlying dataset
         will be defined.
         DatasetQuery instance can become attached in two scenarios:
-            1. ds = DatasetQuery(name="dogs", version=1) -> ds is attached to dogs
-            2. ds = ds.save("dogs", version=1) -> ds is attached to dogs dataset
+            1. ds = DatasetQuery(name="dogs", version="1.0.0") -> ds is attached to dogs
+            2. ds = ds.save("dogs", version="1.0.0") -> ds is attached to dogs dataset
         It can move to detached state if filter or similar methods are called on it,
         as then it no longer 100% represents underlying datasets.
         """
@@ -1663,7 +1663,7 @@ class DatasetQuery:
         )
         return query
 
-    def _add_dependencies(self, dataset: "DatasetRecord", version: int):
+    def _add_dependencies(self, dataset: "DatasetRecord", version: str):
         for dependency in self.dependencies:
             ds_dependency_name, ds_dependency_version = dependency
             self.catalog.metastore.add_dataset_dependency(
@@ -1685,7 +1685,7 @@ class DatasetQuery:
     def save(
         self,
         name: Optional[str] = None,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         feature_schema: Optional[dict] = None,
         description: Optional[str] = None,
         attrs: Optional[list[str]] = None,
datachain/query/session.py
CHANGED

@@ -69,7 +69,7 @@ class Session:
         self.catalog = catalog or get_catalog(
             client_config=client_config, in_memory=in_memory
         )
-        self.dataset_versions: list[tuple[DatasetRecord, int, bool]] = []
+        self.dataset_versions: list[tuple[DatasetRecord, str, bool]] = []
 
     def __enter__(self):
         # Push the current context onto the stack
@@ -90,7 +90,7 @@ class Session:
         Session.SESSION_CONTEXTS.pop()
 
     def add_dataset_version(
-        self, dataset: "DatasetRecord", version: int, listing: bool = False
+        self, dataset: "DatasetRecord", version: str, listing: bool = False
     ) -> None:
         self.dataset_versions.append((dataset, version, listing))
 
datachain/remote/studio.py
CHANGED

@@ -307,7 +307,7 @@ class StudioClient:
     def rm_dataset(
         self,
         name: str,
-        version: Optional[int] = None,
+        version: Optional[str] = None,
         force: Optional[bool] = False,
     ) -> Response[DatasetInfoData]:
         return self._send_request(
@@ -336,7 +336,7 @@ class StudioClient:
         return response
 
     def dataset_rows_chunk(
-        self, name: str, version: int, offset: int
+        self, name: str, version: str, offset: int
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
@@ -353,7 +353,7 @@ class StudioClient:
         )
 
     def export_dataset_table(
-        self, name: str, version: int
+        self, name: str, version: str
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
             "datachain/datasets/export",
@@ -362,7 +362,7 @@ class StudioClient:
         )
 
     def dataset_export_status(
-        self, name: str, version: int
+        self, name: str, version: str
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
             "datachain/datasets/export-status",