datachain 0.20.1__py3-none-any.whl → 0.20.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/hf.py +10 -0
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +19 -6
- datachain/data_storage/sqlite.py +2 -2
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +174 -86
- datachain/lib/dc/datasets.py +25 -37
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/METADATA +1 -1
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/RECORD +33 -33
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/WHEEL +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.1.dist-info → datachain-0.20.3.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from datachain.lib import namespaces, projects
|
|
2
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
3
2
|
from datachain.lib.dc import (
|
|
4
3
|
C,
|
|
@@ -33,6 +32,7 @@ from datachain.lib.file import (
|
|
|
33
32
|
VideoFrame,
|
|
34
33
|
)
|
|
35
34
|
from datachain.lib.model_store import ModelStore
|
|
35
|
+
from datachain.lib.projects import create as create_project
|
|
36
36
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
37
37
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
38
38
|
from datachain.query import metrics, param
|
|
@@ -63,14 +63,13 @@ __all__ = [
|
|
|
63
63
|
"VideoFile",
|
|
64
64
|
"VideoFragment",
|
|
65
65
|
"VideoFrame",
|
|
66
|
+
"create_project",
|
|
66
67
|
"datasets",
|
|
67
68
|
"delete_dataset",
|
|
68
69
|
"is_chain_type",
|
|
69
70
|
"listings",
|
|
70
71
|
"metrics",
|
|
71
|
-
"namespaces",
|
|
72
72
|
"param",
|
|
73
|
-
"projects",
|
|
74
73
|
"read_csv",
|
|
75
74
|
"read_database",
|
|
76
75
|
"read_dataset",
|
datachain/cache.py
CHANGED
|
@@ -39,7 +39,7 @@ def temporary_cache(
|
|
|
39
39
|
cache.destroy()
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
class Cache:
|
|
42
|
+
class Cache: # noqa: PLW1641
|
|
43
43
|
def __init__(self, cache_dir: str, tmp_dir: str):
|
|
44
44
|
self.odb = LocalHashFileDB(
|
|
45
45
|
LocalFileSystem(),
|
|
@@ -76,9 +76,9 @@ class Cache:
|
|
|
76
76
|
async def download(
|
|
77
77
|
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
78
78
|
) -> None:
|
|
79
|
-
from_path = f"{file.source}/{file.path}"
|
|
80
79
|
from dvc_objects.fs.utils import tmp_fname
|
|
81
80
|
|
|
81
|
+
from_path = file.get_uri()
|
|
82
82
|
odb_fs = self.odb.fs
|
|
83
83
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
84
84
|
size = file.size
|
datachain/catalog/catalog.py
CHANGED
|
@@ -1491,13 +1491,13 @@ class Catalog:
|
|
|
1491
1491
|
|
|
1492
1492
|
namespace = self.metastore.create_namespace(
|
|
1493
1493
|
remote_ds.project.namespace.name,
|
|
1494
|
-
description=remote_ds.project.namespace.
|
|
1494
|
+
description=remote_ds.project.namespace.descr,
|
|
1495
1495
|
uuid=remote_ds.project.namespace.uuid,
|
|
1496
1496
|
)
|
|
1497
1497
|
project = self.metastore.create_project(
|
|
1498
|
-
remote_ds.project.name,
|
|
1499
1498
|
namespace.name,
|
|
1500
|
-
|
|
1499
|
+
remote_ds.project.name,
|
|
1500
|
+
description=remote_ds.project.descr,
|
|
1501
1501
|
uuid=remote_ds.project.uuid,
|
|
1502
1502
|
)
|
|
1503
1503
|
|
datachain/cli/commands/ls.py
CHANGED
|
@@ -63,8 +63,8 @@ def ls_local(
|
|
|
63
63
|
print(format_ls_entry(entry))
|
|
64
64
|
else:
|
|
65
65
|
# Collect results in a list here to prevent interference from `tqdm` and `print`
|
|
66
|
-
listing =
|
|
67
|
-
for ls in listing:
|
|
66
|
+
listing = listings().to_list("listing")
|
|
67
|
+
for (ls,) in listing:
|
|
68
68
|
print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
|
|
69
69
|
|
|
70
70
|
|
datachain/client/fsspec.py
CHANGED
|
@@ -207,13 +207,14 @@ class Client(ABC):
|
|
|
207
207
|
)
|
|
208
208
|
|
|
209
209
|
async def get_current_etag(self, file: "File") -> str:
|
|
210
|
+
file_path = file.get_path_normalized()
|
|
210
211
|
kwargs = {}
|
|
211
212
|
if self._is_version_aware():
|
|
212
213
|
kwargs["version_id"] = file.version
|
|
213
214
|
info = await self.fs._info(
|
|
214
|
-
self.get_full_path(
|
|
215
|
+
self.get_full_path(file_path, file.version), **kwargs
|
|
215
216
|
)
|
|
216
|
-
return self.info_to_file(info,
|
|
217
|
+
return self.info_to_file(info, file_path).etag
|
|
217
218
|
|
|
218
219
|
def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
|
|
219
220
|
info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
|
|
@@ -386,7 +387,8 @@ class Client(ABC):
|
|
|
386
387
|
return open(cache_path, mode="rb")
|
|
387
388
|
assert not file.location
|
|
388
389
|
return FileWrapper(
|
|
389
|
-
self.fs.open(self.get_full_path(file.
|
|
390
|
+
self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
|
|
391
|
+
cb,
|
|
390
392
|
) # type: ignore[return-value]
|
|
391
393
|
|
|
392
394
|
def upload(self, data: bytes, path: str) -> "File":
|
datachain/client/hf.py
CHANGED
|
@@ -21,6 +21,9 @@ def _wrap_class(sync_fs_class):
|
|
|
21
21
|
asynchronous to False by default. This is similar to other Async FS
|
|
22
22
|
we initialize. E.g. it means we don't break things in Jupyter where code
|
|
23
23
|
run in async.
|
|
24
|
+
|
|
25
|
+
This also fixes write operations by ensuring they are properly forwarded
|
|
26
|
+
to the underlying filesystem without async buffering issues.
|
|
24
27
|
"""
|
|
25
28
|
from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
|
|
26
29
|
|
|
@@ -29,6 +32,13 @@ def _wrap_class(sync_fs_class):
|
|
|
29
32
|
sync_fs = sync_fs_class(*args, **kwargs)
|
|
30
33
|
super().__init__(sync_fs, asynchronous=False)
|
|
31
34
|
|
|
35
|
+
def open(self, path, mode="rb", **kwargs):
|
|
36
|
+
# Override open to ensure write operations work correctly.
|
|
37
|
+
# It seems to be a bug in the fsspec wrapper. It avoids
|
|
38
|
+
# wrapping open() explicitly but also doesn't redirect it to
|
|
39
|
+
# sync filesystem.
|
|
40
|
+
return self.sync_fs.open(path, mode, **kwargs)
|
|
41
|
+
|
|
32
42
|
GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
|
|
33
43
|
return GeneratedAsyncFileSystemWrapper
|
|
34
44
|
|
datachain/client/local.py
CHANGED
|
@@ -99,7 +99,7 @@ class FileClient(Client):
|
|
|
99
99
|
)
|
|
100
100
|
|
|
101
101
|
async def get_current_etag(self, file: "File") -> str:
|
|
102
|
-
info = self.fs.info(self.get_full_path(file.
|
|
102
|
+
info = self.fs.info(self.get_full_path(file.get_path_normalized()))
|
|
103
103
|
return self.info_to_file(info, "").etag
|
|
104
104
|
|
|
105
105
|
async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
|
|
@@ -138,8 +138,8 @@ class FileClient(Client):
|
|
|
138
138
|
if not self.use_symlinks:
|
|
139
139
|
super().fetch_nodes(nodes, shared_progress_bar)
|
|
140
140
|
|
|
141
|
-
def do_instantiate_object(self,
|
|
141
|
+
def do_instantiate_object(self, file: File, dst: str) -> None:
|
|
142
142
|
if self.use_symlinks:
|
|
143
|
-
os.symlink(Path(self.name,
|
|
143
|
+
os.symlink(Path(self.name, file.path), dst)
|
|
144
144
|
else:
|
|
145
|
-
super().do_instantiate_object(
|
|
145
|
+
super().do_instantiate_object(file, dst)
|
datachain/data_storage/metastore.py
CHANGED
|
@@ -185,8 +185,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
185
185
|
@abstractmethod
|
|
186
186
|
def create_project(
|
|
187
187
|
self,
|
|
188
|
-
name: str,
|
|
189
188
|
namespace_name: str,
|
|
189
|
+
name: str,
|
|
190
190
|
description: Optional[str] = None,
|
|
191
191
|
uuid: Optional[str] = None,
|
|
192
192
|
ignore_if_exists: bool = True,
|
|
@@ -195,8 +195,13 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
195
195
|
"""Creates new project in specific namespace"""
|
|
196
196
|
|
|
197
197
|
@abstractmethod
|
|
198
|
-
def get_project(
|
|
199
|
-
|
|
198
|
+
def get_project(
|
|
199
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
200
|
+
) -> Project:
|
|
201
|
+
"""
|
|
202
|
+
Gets a single project inside some namespace by name.
|
|
203
|
+
It also creates project if not found and create flag is set to True.
|
|
204
|
+
"""
|
|
200
205
|
|
|
201
206
|
@abstractmethod
|
|
202
207
|
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
@@ -763,14 +768,18 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
763
768
|
|
|
764
769
|
def create_project(
|
|
765
770
|
self,
|
|
766
|
-
name: str,
|
|
767
771
|
namespace_name: str,
|
|
772
|
+
name: str,
|
|
768
773
|
description: Optional[str] = None,
|
|
769
774
|
uuid: Optional[str] = None,
|
|
770
775
|
ignore_if_exists: bool = True,
|
|
771
776
|
**kwargs,
|
|
772
777
|
) -> Project:
|
|
773
|
-
|
|
778
|
+
try:
|
|
779
|
+
namespace = self.get_namespace(namespace_name)
|
|
780
|
+
except NamespaceNotFoundError:
|
|
781
|
+
namespace = self.create_namespace(namespace_name)
|
|
782
|
+
|
|
774
783
|
query = self._projects_insert().values(
|
|
775
784
|
namespace_id=namespace.id,
|
|
776
785
|
uuid=uuid or str(uuid4()),
|
|
@@ -788,7 +797,9 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
788
797
|
|
|
789
798
|
return self.get_project(name, namespace.name)
|
|
790
799
|
|
|
791
|
-
def get_project(
|
|
800
|
+
def get_project(
|
|
801
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
802
|
+
) -> Project:
|
|
792
803
|
"""Gets a single project inside some namespace by name"""
|
|
793
804
|
n = self._namespaces
|
|
794
805
|
p = self._projects
|
|
@@ -803,6 +814,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
803
814
|
|
|
804
815
|
rows = list(self.db.execute(query, conn=conn))
|
|
805
816
|
if not rows:
|
|
817
|
+
if create:
|
|
818
|
+
return self.create_project(namespace_name, name)
|
|
806
819
|
raise ProjectNotFoundError(
|
|
807
820
|
f"Project {name} in namespace {namespace_name} not found."
|
|
808
821
|
)
|
datachain/data_storage/sqlite.py
CHANGED
|
@@ -469,10 +469,10 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
469
469
|
Studio dataset.
|
|
470
470
|
"""
|
|
471
471
|
system_namespace = self.create_namespace(Namespace.system(), "System namespace")
|
|
472
|
-
self.create_project(Project.listing(),
|
|
472
|
+
self.create_project(system_namespace.name, Project.listing(), "Listing project")
|
|
473
473
|
|
|
474
474
|
local_namespace = self.create_namespace(Namespace.default(), "Local namespace")
|
|
475
|
-
self.create_project(Project.default(),
|
|
475
|
+
self.create_project(local_namespace.name, Project.default(), "Local project")
|
|
476
476
|
|
|
477
477
|
def _check_schema_version(self) -> None:
|
|
478
478
|
"""
|
datachain/dataset.py
CHANGED
|
@@ -83,10 +83,11 @@ def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
|
|
|
83
83
|
if not name:
|
|
84
84
|
raise ValueError("Name must be defined to parse it")
|
|
85
85
|
split = name.split(".")
|
|
86
|
-
|
|
87
|
-
|
|
86
|
+
name = split[-1]
|
|
87
|
+
project_name = split[-2] if len(split) > 1 else None
|
|
88
|
+
namespace_name = split[-3] if len(split) > 2 else None
|
|
88
89
|
|
|
89
|
-
return
|
|
90
|
+
return namespace_name, project_name, name
|
|
90
91
|
|
|
91
92
|
|
|
92
93
|
class DatasetDependencyType:
|
datachain/delta.py
CHANGED
|
@@ -62,7 +62,7 @@ def _get_delta_chain(
|
|
|
62
62
|
)
|
|
63
63
|
|
|
64
64
|
# Calculate diff between source versions
|
|
65
|
-
return source_dc_latest.
|
|
65
|
+
return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)
|
|
66
66
|
|
|
67
67
|
|
|
68
68
|
def _get_retry_chain(
|
|
@@ -237,7 +237,7 @@ def delta_retry_update(
|
|
|
237
237
|
return None, None, False
|
|
238
238
|
|
|
239
239
|
latest_dataset = datachain.read_dataset(name, version=latest_version)
|
|
240
|
-
compared_chain = latest_dataset.
|
|
240
|
+
compared_chain = latest_dataset.diff(
|
|
241
241
|
processing_chain,
|
|
242
242
|
on=right_on or on,
|
|
243
243
|
added=True,
|
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
|
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
|
|
|
76
76
|
fs_path = file.path
|
|
77
77
|
fs = ReferenceFileSystem({fs_path: [cache_path]})
|
|
78
78
|
else:
|
|
79
|
-
fs, fs_path = file.get_fs(), file.
|
|
79
|
+
fs, fs_path = file.get_fs(), file.get_fs_path()
|
|
80
80
|
|
|
81
81
|
kwargs = self.kwargs
|
|
82
82
|
if format := kwargs.get("format"):
|
|
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
|
|
|
160
160
|
kwargs["format"] = fix_pyarrow_format(format, parse_options)
|
|
161
161
|
|
|
162
162
|
schemas = []
|
|
163
|
-
for file in chain.
|
|
164
|
-
ds = dataset(file.
|
|
163
|
+
for (file,) in chain.to_iter("file"):
|
|
164
|
+
ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs) # type: ignore[union-attr]
|
|
165
165
|
schemas.append(ds.schema)
|
|
166
166
|
if not schemas:
|
|
167
167
|
raise ValueError(
|
datachain/lib/dataset_info.py
CHANGED
|
@@ -22,8 +22,8 @@ if TYPE_CHECKING:
|
|
|
22
22
|
|
|
23
23
|
class DatasetInfo(DataModel):
|
|
24
24
|
name: str
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
namespace: str
|
|
26
|
+
project: str
|
|
27
27
|
uuid: str = Field(default=str(uuid4()))
|
|
28
28
|
version: str = Field(default=DEFAULT_DATASET_VERSION)
|
|
29
29
|
status: int = Field(default=DatasetStatus.CREATED)
|
|
@@ -93,8 +93,8 @@ class DatasetInfo(DataModel):
|
|
|
93
93
|
return cls(
|
|
94
94
|
uuid=version.uuid,
|
|
95
95
|
name=dataset.name,
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
namespace=dataset.project.namespace.name,
|
|
97
|
+
project=dataset.project.name,
|
|
98
98
|
version=version.version,
|
|
99
99
|
status=version.status,
|
|
100
100
|
created_at=version.created_at,
|