datachain 0.20.1__py3-none-any.whl → 0.20.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +19 -6
- datachain/data_storage/sqlite.py +2 -2
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +174 -86
- datachain/lib/dc/datasets.py +25 -37
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/METADATA +1 -1
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/RECORD +33 -33
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/WHEEL +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/top_level.txt +0 -0
datachain/lib/file.py
CHANGED
@@ -5,13 +5,14 @@ import json
 import logging
 import os
 import posixpath
+import warnings
 from abc import ABC, abstractmethod
 from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
 from functools import partial
 from io import BytesIO
-from pathlib import Path, PurePosixPath
+from pathlib import Path, PurePath, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
 from urllib.parse import unquote, urlparse
 from urllib.request import url2pathname
@@ -69,7 +70,7 @@ class FileExporter(NodesThreadPool):
         for task in done:
             task.result()
 
-    def do_task(self, file):
+    def do_task(self, file: "File"):
         file.export(
             self.output,
             self.placement,
@@ -274,8 +275,8 @@ class File(DataModel):
 
     @field_validator("path", mode="before")
     @classmethod
-    def validate_path(cls, path):
-        return
+    def validate_path(cls, path: str) -> str:
+        return PurePath(path).as_posix() if path else ""
 
     def model_dump_custom(self):
         res = self.model_dump()
@@ -337,11 +338,11 @@ class File(DataModel):
         return cls(**{key: row[key] for key in cls._datachain_column_types})
 
     @property
-    def name(self):
+    def name(self) -> str:
         return PurePosixPath(self.path).name
 
     @property
-    def parent(self):
+    def parent(self) -> str:
         return str(PurePosixPath(self.path).parent)
 
     @contextmanager
@@ -391,7 +392,7 @@ class File(DataModel):
 
         client.upload(self.read(), destination)
 
-    def _symlink_to(self, destination: str):
+    def _symlink_to(self, destination: str) -> None:
         if self.location:
             raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
 
@@ -400,7 +401,7 @@ class File(DataModel):
             source = self.get_local_path()
             assert source, "File was not cached"
         elif self.source.startswith("file://"):
-            source = self.
+            source = self.get_fs_path()
         else:
             raise OSError(errno.EXDEV, "can't link across filesystems")
 
@@ -481,27 +482,62 @@ class File(DataModel):
 
     def get_file_ext(self):
        """Returns last part of file name without `.`."""
-        return PurePosixPath(self.path).suffix.
+        return PurePosixPath(self.path).suffix.lstrip(".")
 
     def get_file_stem(self):
        """Returns file name without extension."""
        return PurePosixPath(self.path).stem
 
     def get_full_name(self):
-        """
+        """
+        [DEPRECATED] Use `file.path` directly instead.
+
+        Returns name with parent directories.
+        """
+        warnings.warn(
+            "file.get_full_name() is deprecated and will be removed "
+            "in a future version. Use `file.path` directly.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.path
 
-    def
+    def get_path_normalized(self) -> str:
+        if not self.path:
+            raise FileError("path must not be empty", self.source, self.path)
+
+        if self.path.endswith("/"):
+            raise FileError("path must not be a directory", self.source, self.path)
+
+        normpath = os.path.normpath(self.path)
+        normpath = PurePath(normpath).as_posix()
+
+        if normpath == ".":
+            raise FileError("path must not be a directory", self.source, self.path)
+
+        if any(part == ".." for part in PurePath(normpath).parts):
+            raise FileError("path must not contain '..'", self.source, self.path)
+
+        return normpath
+
+    def get_uri(self) -> str:
         """Returns file URI."""
-        return f"{self.source}/{self.
+        return f"{self.source}/{self.get_path_normalized()}"
 
-    def
-        """
+    def get_fs_path(self) -> str:
+        """
+        Returns file path with respect to the filescheme.
+
+        If `normalize` is True, the path is normalized to remove any redundant
+        separators and up-level references.
+
+        If the file scheme is "file", the path is converted to a local file path
+        using `url2pathname`. Otherwise, the original path with scheme is returned.
+        """
         path = unquote(self.get_uri())
-
-        if
-            path =
-            path = url2pathname(path)
+        path_parsed = urlparse(path)
+        if path_parsed.scheme == "file":
+            path = url2pathname(path_parsed.path)
         return path
 
     def get_destination_path(
@@ -516,7 +552,7 @@ class File(DataModel):
         elif placement == "etag":
             path = f"{self.etag}{self.get_file_suffix()}"
         elif placement == "fullpath":
-            path = unquote(self.
+            path = unquote(self.get_path_normalized())
             source = urlparse(self.source)
             if source.scheme and source.scheme != "file":
                 path = posixpath.join(source.netloc, path)
@@ -554,8 +590,9 @@ class File(DataModel):
             ) from e
 
         try:
-
-
+            normalized_path = self.get_path_normalized()
+            info = client.fs.info(client.get_full_path(normalized_path))
+            converted_info = client.info_to_file(info, normalized_path)
             return type(self)(
                 path=self.path,
                 source=self.source,
@@ -566,8 +603,17 @@ class File(DataModel):
                 last_modified=converted_info.last_modified,
                 location=self.location,
             )
+        except FileError as e:
+            logger.warning(
+                "File error when resolving %s/%s: %s", self.source, self.path, str(e)
+            )
         except (FileNotFoundError, PermissionError, OSError) as e:
-            logger.warning(
+            logger.warning(
+                "File system error when resolving %s/%s: %s",
+                self.source,
+                self.path,
+                str(e),
+            )
 
         return type(self)(
             path=self.path,
@@ -583,6 +629,8 @@ class File(DataModel):
 
 def resolve(file: File) -> File:
     """
+    [DEPRECATED] Use `file.resolve()` directly instead.
+
     Resolve a File object by checking its existence and updating its metadata.
 
     This function is a wrapper around the File.resolve() method, designed to be
@@ -598,6 +646,12 @@ def resolve(file: File) -> File:
         RuntimeError: If the file's catalog is not set or if
         the file source protocol is unsupported.
     """
+    warnings.warn(
+        "resolve() is deprecated and will be removed "
+        "in a future version. Use file.resolve() directly.",
+        DeprecationWarning,
+        stacklevel=2,
+    )
     return file.resolve()
 
 
@@ -945,7 +999,7 @@ class ArrowRow(DataModel):
             ds = dataset(path, **self.kwargs)
 
         else:
-            path = self.file.
+            path = self.file.get_fs_path()
             ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)
 
         return ds.take([self.index]).to_reader()
datachain/lib/meta_formats.py
CHANGED
@@ -106,7 +106,7 @@ def read_meta( # noqa: C901
     from datachain import read_storage
 
     if schema_from:
-        file =
+        file = read_storage(schema_from, type="text").limit(1).to_values("file")[0]
         model_code = gen_datamodel_code(
             file, format=format, jmespath=jmespath, model_name=model_name
         )
datachain/lib/namespaces.py
CHANGED
@@ -6,39 +6,37 @@ from datachain.query import Session
 
 
 def create(
-    name: str,
+    name: str, descr: Optional[str] = None, session: Optional[Session] = None
 ) -> Namespace:
     """
-    Creates a new
-
-
-
-
-    in Studio, where it is allowed.
-    In local environment all datasets are created under the default `local` namespace.
+    Creates a new namespace.
+
+    Namespaces organize projects, which in turn organize datasets. A default
+    namespace always exists and is used if none is specified. Multiple namespaces
+    can be created in Studio, but only the default is available in the CLI.
 
     Parameters:
-        name
-
-        session
+        name: Name of the new namespace.
+        descr: Optional description of the namespace.
+        session: Optional session to use for the operation.
 
     Example:
        ```py
-
-        namespace =
+        from datachain.lib.namespaces import create as create_namespace
+        namespace = create_namespace("dev", "Dev namespace")
       ```
     """
    session = Session.get(session)
 
    if not session.catalog.metastore.namespace_allowed_to_create:
-        raise NamespaceCreateNotAllowedError("Creating
+        raise NamespaceCreateNotAllowedError("Creating namespace is not allowed")
 
    Namespace.validate_name(name)
 
-    return session.catalog.metastore.create_namespace(name,
+    return session.catalog.metastore.create_namespace(name, descr)
 
 
-def get(name: str, session: Optional[Session]) -> Namespace:
+def get(name: str, session: Optional[Session] = None) -> Namespace:
     """
     Gets a namespace by name.
     If the namespace is not found, a `NamespaceNotFoundError` is raised.
@@ -66,8 +64,8 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
 
     Example:
        ```py
-
-        namespaces =
+        from datachain.lib.namespaces import ls as ls_namespaces
+        namespaces = ls_namespaces()
       ```
     """
    return Session.get(session).catalog.metastore.list_namespaces()
datachain/lib/projects.py
CHANGED
@@ -6,81 +6,81 @@ from datachain.query import Session
 
 
 def create(
+    namespace: str,
     name: str,
-
-    description: Optional[str] = None,
+    descr: Optional[str] = None,
     session: Optional[Session] = None,
 ) -> Project:
     """
-    Creates a new
-
-
-
-
-    In local environment all datasets are created under the default `local` project.
+    Creates a new project under a specified namespace.
+
+    Projects help organize datasets. A default project is always available,
+    but users can create additional ones (only in Studio, not via CLI).
+
 
     Parameters:
-        name
-        namespace
-
-
-        session : Session to use for creating project.
+        name: Name of the new project.
+        namespace: Namespace to create the project in. Created if it doesn't exist.
+        descr: Optional description of the project.
+        session: Optional session to use for the operation.
 
     Example:
        ```py
        import datachain as dc
-        project = dc.
+        project = dc.create_project("dev", "my-project", "My personal project")
       ```
     """
    session = Session.get(session)
 
    if not session.catalog.metastore.project_allowed_to_create:
-        raise ProjectCreateNotAllowedError("Creating
+        raise ProjectCreateNotAllowedError("Creating project is not allowed")
 
    Project.validate_name(name)
 
-    return session.catalog.metastore.create_project(
+    return session.catalog.metastore.create_project(namespace, name, descr)
 
 
-def get(name: str,
+def get(name: str, namespace: str, session: Optional[Session]) -> Project:
     """
     Gets a project by name in some namespace.
     If the project is not found, a `ProjectNotFoundError` is raised.
 
     Parameters:
         name : The name of the project.
-
+        namespace : The name of the namespace.
         session : Session to use for getting project.
 
     Example:
        ```py
        import datachain as dc
-
+        from datachain.lib.projects import get as get_project
+        project = get_project("my-project", "local")
       ```
     """
-    return Session.get(session).catalog.metastore.get_project(name,
+    return Session.get(session).catalog.metastore.get_project(name, namespace)
 
 
 def ls(
-
+    namespace: Optional[str] = None, session: Optional[Session] = None
 ) -> list[Project]:
     """
     Gets a list of projects in a specific namespace or from all namespaces.
 
     Parameters:
-
+        namespace : An optional namespace name.
         session : Session to use for getting project.
 
     Example:
        ```py
        import datachain as dc
-
-
+        from datachain.lib.projects import ls as ls_projects
+        local_namespace_projects = ls_projects("local")
+        all_projects = ls_projects()
       ```
     """
    session = Session.get(session)
    namespace_id = None
-    if
-        namespace_id = session.catalog.metastore.get_namespace(
+    if namespace:
+        namespace_id = session.catalog.metastore.get_namespace(namespace).id
 
    return session.catalog.metastore.list_projects(namespace_id)
datachain/lib/pytorch.py
CHANGED
@@ -130,7 +130,7 @@ class PytorchDataset(IterableDataset):
         if self.num_samples > 0:
             ds = ds.sample(self.num_samples)
         ds = ds.chunk(total_rank, total_workers)
-        yield from ds.
+        yield from ds.to_iter()
 
     def _iter_with_prefetch(self) -> Generator[tuple[Any], None, None]:
         from datachain.lib.udf import _prefetch_inputs
datachain/lib/tar.py
CHANGED
@@ -6,12 +6,11 @@ from datachain.lib.file import File, TarVFile
 
 
 def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
-    new_parent = parent.get_full_name()
     etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
     etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
     return File(
         source=parent.source,
-        path=f"{
+        path=f"{parent.path}/{info.name}",
         version=parent.version,
         size=info.size,
         etag=etag,
datachain/lib/udf_signature.py
CHANGED
datachain/lib/webdataset.py
CHANGED
@@ -34,29 +34,29 @@ warnings.filterwarnings(
 
 
 class WDSError(DataChainError):
-    def __init__(self,
-        super().__init__(f"WebDataset error '{
+    def __init__(self, tar_name: str, message: str):
+        super().__init__(f"WebDataset error '{tar_name}': {message}")
 
 
 class CoreFileDuplicationError(WDSError):
-    def __init__(self,
+    def __init__(self, tar_name: str, file1: str, file2: str):
         super().__init__(
-
+            tar_name, f"duplication of files with core extensions: {file1}, {file2}"
         )
 
 
 class CoreFileNotFoundError(WDSError):
-    def __init__(self,
+    def __init__(self, tar_name: str, extensions: Sequence[str], stem: str):
         super().__init__(
-
+            tar_name,
             f"no files with the extensions '{','.join(extensions)}'"
             f" were found for file stem {stem}",
         )
 
 
 class UnknownFileExtensionError(WDSError):
-    def __init__(self,
-        super().__init__(
+    def __init__(self, tar_name, name: str, ext: str):
+        super().__init__(tar_name, f"unknown extension '{ext}' for file '{name}'")
 
 
 class WDSBasic(DataModel):
@@ -113,10 +113,10 @@ class Builder:
     def __init__(
         self,
         tar_stream: File,
-        core_extensions:
+        core_extensions: Sequence[str],
         wds_class: type[WDSBasic],
-        tar,
-        encoding="utf-8",
+        tar: tarfile.TarFile,
+        encoding: str = "utf-8",
     ):
         self._core_extensions = core_extensions
         self._tar_stream = tar_stream
@@ -145,18 +145,20 @@ class Builder:
         if ext in self._core_extensions:
             if self.state.core_file is not None:
                 raise CoreFileDuplicationError(
-                    self._tar_stream, file.name, self.state.core_file.name
+                    self._tar_stream.name, file.name, self.state.core_file.name
                 )
             self.state.core_file = file
         elif ext in self.state.data:
             raise WDSError(
-                self._tar_stream,
+                self._tar_stream.name,
                 f"file with extension '.{ext}' already exists in the archive",
             )
         else:
             type_ = self._get_type(ext)
             if type_ is None:
-                raise UnknownFileExtensionError(
+                raise UnknownFileExtensionError(
+                    self._tar_stream.name, fstream.name, ext
+                )
 
             if issubclass(type_, WDSReadableSubclass):
                 reader = type_._reader
@@ -165,7 +167,7 @@ class Builder:
 
         if reader is None:
             raise WDSError(
-                self._tar_stream,
+                self._tar_stream.name,
                 f"unable to find a reader for type {type_}, extension .{ext}",
             )
         self.state.data[ext] = reader(self, file)
@@ -173,7 +175,7 @@ class Builder:
     def produce(self):
         if self.state.core_file is None:
             raise CoreFileNotFoundError(
-                self._tar_stream, self._core_extensions, self.state.stem
+                self._tar_stream.name, self._core_extensions, self.state.stem
             )
 
         file = build_tar_member(self._tar_stream, self.state.core_file)
@@ -194,7 +196,13 @@ class Builder:
         return anno
 
 
-def get_tar_groups(
+def get_tar_groups(
+    stream: File,
+    tar: tarfile.TarFile,
+    core_extensions: Sequence[str],
+    spec: type[WDSBasic],
+    encoding: str = "utf-8",
+) -> Iterator[WDSBasic]:
     builder = Builder(stream, core_extensions, spec, tar, encoding)
 
     for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
@@ -210,9 +218,11 @@ def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
 
 
 def process_webdataset(
-    core_extensions: Sequence[str] = ("jpg", "png"),
-
-
+    core_extensions: Sequence[str] = ("jpg", "png"),
+    spec: type[WDSBasic] = WDSAllFile,
+    encoding: str = "utf-8",
+) -> Callable[[File], Iterator]:
+    def wds_func(file: File) -> Iterator[spec]:  # type: ignore[valid-type]
         with file.open() as fd:
             with tarfile.open(fileobj=fd) as tar:
                 yield from get_tar_groups(file, tar, core_extensions, spec, encoding)
datachain/namespace.py
CHANGED
@@ -14,7 +14,7 @@ class Namespace:
     id: int
     uuid: str
     name: str
-
+    descr: Optional[str]
     created_at: datetime
 
     @staticmethod
@@ -54,10 +54,10 @@ class Namespace:
         id: int,
         uuid: str,
         name: str,
-
+        descr: Optional[str],
         created_at: datetime,
     ) -> "Namespace":
-        return cls(id, uuid, name,
+        return cls(id, uuid, name, descr, created_at)
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "Namespace":
datachain/project.py
CHANGED
@@ -15,7 +15,7 @@ class Project:
     id: int
     uuid: str
     name: str
-
+    descr: Optional[str]
     created_at: datetime
     namespace: Namespace
 
@@ -52,12 +52,12 @@ class Project:
         namespace_id: int,
         namespace_uuid: str,
         namespace_name: str,
-
+        namespace_descr: Optional[str],
         namespace_created_at: datetime,
         project_id: int,
         uuid: str,
         name: str,
-
+        descr: Optional[str],
         created_at: datetime,
         project_namespace_id: int,
     ) -> "Project":
@@ -65,11 +65,11 @@ class Project:
             namespace_id,
             namespace_uuid,
             namespace_name,
-
+            namespace_descr,
             namespace_created_at,
         )
 
-        return cls(project_id, uuid, name,
+        return cls(project_id, uuid, name, descr, created_at, namespace)
 
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> "Project":
|