datachain 0.20.2__py3-none-any.whl → 0.20.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +2 -3
- datachain/cache.py +2 -2
- datachain/catalog/catalog.py +3 -3
- datachain/cli/commands/ls.py +2 -2
- datachain/client/fsspec.py +5 -3
- datachain/client/local.py +4 -4
- datachain/data_storage/metastore.py +22 -7
- datachain/data_storage/sqlite.py +1 -4
- datachain/dataset.py +4 -3
- datachain/delta.py +2 -2
- datachain/func/func.py +1 -1
- datachain/lib/arrow.py +3 -3
- datachain/lib/dataset_info.py +4 -4
- datachain/lib/dc/datachain.py +178 -89
- datachain/lib/dc/datasets.py +46 -42
- datachain/lib/dc/storage.py +24 -38
- datachain/lib/file.py +77 -23
- datachain/lib/meta_formats.py +1 -1
- datachain/lib/namespaces.py +16 -18
- datachain/lib/projects.py +26 -26
- datachain/lib/pytorch.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/udf_signature.py +1 -1
- datachain/lib/webdataset.py +30 -20
- datachain/namespace.py +3 -3
- datachain/project.py +5 -5
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/METADATA +1 -1
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/RECORD +32 -32
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/WHEEL +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.20.2.dist-info → datachain-0.20.4.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
from datachain.lib import namespaces, projects
|
|
2
1
|
from datachain.lib.data_model import DataModel, DataType, is_chain_type
|
|
3
2
|
from datachain.lib.dc import (
|
|
4
3
|
C,
|
|
@@ -33,6 +32,7 @@ from datachain.lib.file import (
|
|
|
33
32
|
VideoFrame,
|
|
34
33
|
)
|
|
35
34
|
from datachain.lib.model_store import ModelStore
|
|
35
|
+
from datachain.lib.projects import create as create_project
|
|
36
36
|
from datachain.lib.udf import Aggregator, Generator, Mapper
|
|
37
37
|
from datachain.lib.utils import AbstractUDF, DataChainError
|
|
38
38
|
from datachain.query import metrics, param
|
|
@@ -63,14 +63,13 @@ __all__ = [
|
|
|
63
63
|
"VideoFile",
|
|
64
64
|
"VideoFragment",
|
|
65
65
|
"VideoFrame",
|
|
66
|
+
"create_project",
|
|
66
67
|
"datasets",
|
|
67
68
|
"delete_dataset",
|
|
68
69
|
"is_chain_type",
|
|
69
70
|
"listings",
|
|
70
71
|
"metrics",
|
|
71
|
-
"namespaces",
|
|
72
72
|
"param",
|
|
73
|
-
"projects",
|
|
74
73
|
"read_csv",
|
|
75
74
|
"read_database",
|
|
76
75
|
"read_dataset",
|
datachain/cache.py
CHANGED
|
@@ -39,7 +39,7 @@ def temporary_cache(
|
|
|
39
39
|
cache.destroy()
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
class Cache:
|
|
42
|
+
class Cache: # noqa: PLW1641
|
|
43
43
|
def __init__(self, cache_dir: str, tmp_dir: str):
|
|
44
44
|
self.odb = LocalHashFileDB(
|
|
45
45
|
LocalFileSystem(),
|
|
@@ -76,9 +76,9 @@ class Cache:
|
|
|
76
76
|
async def download(
|
|
77
77
|
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
78
78
|
) -> None:
|
|
79
|
-
from_path = f"{file.source}/{file.path}"
|
|
80
79
|
from dvc_objects.fs.utils import tmp_fname
|
|
81
80
|
|
|
81
|
+
from_path = file.get_uri()
|
|
82
82
|
odb_fs = self.odb.fs
|
|
83
83
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
84
84
|
size = file.size
|
datachain/catalog/catalog.py
CHANGED
|
@@ -1491,13 +1491,13 @@ class Catalog:
|
|
|
1491
1491
|
|
|
1492
1492
|
namespace = self.metastore.create_namespace(
|
|
1493
1493
|
remote_ds.project.namespace.name,
|
|
1494
|
-
description=remote_ds.project.namespace.
|
|
1494
|
+
description=remote_ds.project.namespace.descr,
|
|
1495
1495
|
uuid=remote_ds.project.namespace.uuid,
|
|
1496
1496
|
)
|
|
1497
1497
|
project = self.metastore.create_project(
|
|
1498
|
-
remote_ds.project.name,
|
|
1499
1498
|
namespace.name,
|
|
1500
|
-
|
|
1499
|
+
remote_ds.project.name,
|
|
1500
|
+
description=remote_ds.project.descr,
|
|
1501
1501
|
uuid=remote_ds.project.uuid,
|
|
1502
1502
|
)
|
|
1503
1503
|
|
datachain/cli/commands/ls.py
CHANGED
|
@@ -63,8 +63,8 @@ def ls_local(
|
|
|
63
63
|
print(format_ls_entry(entry))
|
|
64
64
|
else:
|
|
65
65
|
# Collect results in a list here to prevent interference from `tqdm` and `print`
|
|
66
|
-
listing =
|
|
67
|
-
for ls in listing:
|
|
66
|
+
listing = listings().to_list("listing")
|
|
67
|
+
for (ls,) in listing:
|
|
68
68
|
print(format_ls_entry(f"{ls.uri}@v{ls.version}")) # type: ignore[union-attr]
|
|
69
69
|
|
|
70
70
|
|
datachain/client/fsspec.py
CHANGED
|
@@ -207,13 +207,14 @@ class Client(ABC):
|
|
|
207
207
|
)
|
|
208
208
|
|
|
209
209
|
async def get_current_etag(self, file: "File") -> str:
|
|
210
|
+
file_path = file.get_path_normalized()
|
|
210
211
|
kwargs = {}
|
|
211
212
|
if self._is_version_aware():
|
|
212
213
|
kwargs["version_id"] = file.version
|
|
213
214
|
info = await self.fs._info(
|
|
214
|
-
self.get_full_path(
|
|
215
|
+
self.get_full_path(file_path, file.version), **kwargs
|
|
215
216
|
)
|
|
216
|
-
return self.info_to_file(info,
|
|
217
|
+
return self.info_to_file(info, file_path).etag
|
|
217
218
|
|
|
218
219
|
def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
|
|
219
220
|
info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
|
|
@@ -386,7 +387,8 @@ class Client(ABC):
|
|
|
386
387
|
return open(cache_path, mode="rb")
|
|
387
388
|
assert not file.location
|
|
388
389
|
return FileWrapper(
|
|
389
|
-
self.fs.open(self.get_full_path(file.
|
|
390
|
+
self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
|
|
391
|
+
cb,
|
|
390
392
|
) # type: ignore[return-value]
|
|
391
393
|
|
|
392
394
|
def upload(self, data: bytes, path: str) -> "File":
|
datachain/client/local.py
CHANGED
|
@@ -99,7 +99,7 @@ class FileClient(Client):
|
|
|
99
99
|
)
|
|
100
100
|
|
|
101
101
|
async def get_current_etag(self, file: "File") -> str:
|
|
102
|
-
info = self.fs.info(self.get_full_path(file.
|
|
102
|
+
info = self.fs.info(self.get_full_path(file.get_path_normalized()))
|
|
103
103
|
return self.info_to_file(info, "").etag
|
|
104
104
|
|
|
105
105
|
async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
|
|
@@ -138,8 +138,8 @@ class FileClient(Client):
|
|
|
138
138
|
if not self.use_symlinks:
|
|
139
139
|
super().fetch_nodes(nodes, shared_progress_bar)
|
|
140
140
|
|
|
141
|
-
def do_instantiate_object(self,
|
|
141
|
+
def do_instantiate_object(self, file: File, dst: str) -> None:
|
|
142
142
|
if self.use_symlinks:
|
|
143
|
-
os.symlink(Path(self.name,
|
|
143
|
+
os.symlink(Path(self.name, file.path), dst)
|
|
144
144
|
else:
|
|
145
|
-
super().do_instantiate_object(
|
|
145
|
+
super().do_instantiate_object(file, dst)
|
datachain/data_storage/metastore.py
CHANGED
|
@@ -176,7 +176,9 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
176
176
|
|
|
177
177
|
@cached_property
|
|
178
178
|
def default_project(self) -> Project:
|
|
179
|
-
return self.get_project(
|
|
179
|
+
return self.get_project(
|
|
180
|
+
self.default_project_name, self.default_namespace_name, create=True
|
|
181
|
+
)
|
|
180
182
|
|
|
181
183
|
@cached_property
|
|
182
184
|
def listing_project(self) -> Project:
|
|
@@ -185,8 +187,8 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
185
187
|
@abstractmethod
|
|
186
188
|
def create_project(
|
|
187
189
|
self,
|
|
188
|
-
name: str,
|
|
189
190
|
namespace_name: str,
|
|
191
|
+
name: str,
|
|
190
192
|
description: Optional[str] = None,
|
|
191
193
|
uuid: Optional[str] = None,
|
|
192
194
|
ignore_if_exists: bool = True,
|
|
@@ -195,8 +197,13 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
195
197
|
"""Creates new project in specific namespace"""
|
|
196
198
|
|
|
197
199
|
@abstractmethod
|
|
198
|
-
def get_project(
|
|
199
|
-
|
|
200
|
+
def get_project(
|
|
201
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
202
|
+
) -> Project:
|
|
203
|
+
"""
|
|
204
|
+
Gets a single project inside some namespace by name.
|
|
205
|
+
It also creates project if not found and create flag is set to True.
|
|
206
|
+
"""
|
|
200
207
|
|
|
201
208
|
@abstractmethod
|
|
202
209
|
def list_projects(self, namespace_id: Optional[int], conn=None) -> list[Project]:
|
|
@@ -763,14 +770,18 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
763
770
|
|
|
764
771
|
def create_project(
|
|
765
772
|
self,
|
|
766
|
-
name: str,
|
|
767
773
|
namespace_name: str,
|
|
774
|
+
name: str,
|
|
768
775
|
description: Optional[str] = None,
|
|
769
776
|
uuid: Optional[str] = None,
|
|
770
777
|
ignore_if_exists: bool = True,
|
|
771
778
|
**kwargs,
|
|
772
779
|
) -> Project:
|
|
773
|
-
|
|
780
|
+
try:
|
|
781
|
+
namespace = self.get_namespace(namespace_name)
|
|
782
|
+
except NamespaceNotFoundError:
|
|
783
|
+
namespace = self.create_namespace(namespace_name)
|
|
784
|
+
|
|
774
785
|
query = self._projects_insert().values(
|
|
775
786
|
namespace_id=namespace.id,
|
|
776
787
|
uuid=uuid or str(uuid4()),
|
|
@@ -788,7 +799,9 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
788
799
|
|
|
789
800
|
return self.get_project(name, namespace.name)
|
|
790
801
|
|
|
791
|
-
def get_project(
|
|
802
|
+
def get_project(
|
|
803
|
+
self, name: str, namespace_name: str, create: bool = False, conn=None
|
|
804
|
+
) -> Project:
|
|
792
805
|
"""Gets a single project inside some namespace by name"""
|
|
793
806
|
n = self._namespaces
|
|
794
807
|
p = self._projects
|
|
@@ -803,6 +816,8 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
803
816
|
|
|
804
817
|
rows = list(self.db.execute(query, conn=conn))
|
|
805
818
|
if not rows:
|
|
819
|
+
if create:
|
|
820
|
+
return self.create_project(namespace_name, name)
|
|
806
821
|
raise ProjectNotFoundError(
|
|
807
822
|
f"Project {name} in namespace {namespace_name} not found."
|
|
808
823
|
)
|
datachain/data_storage/sqlite.py
CHANGED
|
@@ -469,10 +469,7 @@ class SQLiteMetastore(AbstractDBMetastore):
|
|
|
469
469
|
Studio dataset.
|
|
470
470
|
"""
|
|
471
471
|
system_namespace = self.create_namespace(Namespace.system(), "System namespace")
|
|
472
|
-
self.create_project(Project.listing(),
|
|
473
|
-
|
|
474
|
-
local_namespace = self.create_namespace(Namespace.default(), "Local namespace")
|
|
475
|
-
self.create_project(Project.default(), local_namespace.name, "Local project")
|
|
472
|
+
self.create_project(system_namespace.name, Project.listing(), "Listing project")
|
|
476
473
|
|
|
477
474
|
def _check_schema_version(self) -> None:
|
|
478
475
|
"""
|
datachain/dataset.py
CHANGED
|
@@ -83,10 +83,11 @@ def parse_dataset_name(name: str) -> tuple[Optional[str], Optional[str], str]:
|
|
|
83
83
|
if not name:
|
|
84
84
|
raise ValueError("Name must be defined to parse it")
|
|
85
85
|
split = name.split(".")
|
|
86
|
-
|
|
87
|
-
|
|
86
|
+
name = split[-1]
|
|
87
|
+
project_name = split[-2] if len(split) > 1 else None
|
|
88
|
+
namespace_name = split[-3] if len(split) > 2 else None
|
|
88
89
|
|
|
89
|
-
return
|
|
90
|
+
return namespace_name, project_name, name
|
|
90
91
|
|
|
91
92
|
|
|
92
93
|
class DatasetDependencyType:
|
datachain/delta.py
CHANGED
|
@@ -62,7 +62,7 @@ def _get_delta_chain(
|
|
|
62
62
|
)
|
|
63
63
|
|
|
64
64
|
# Calculate diff between source versions
|
|
65
|
-
return source_dc_latest.
|
|
65
|
+
return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)
|
|
66
66
|
|
|
67
67
|
|
|
68
68
|
def _get_retry_chain(
|
|
@@ -237,7 +237,7 @@ def delta_retry_update(
|
|
|
237
237
|
return None, None, False
|
|
238
238
|
|
|
239
239
|
latest_dataset = datachain.read_dataset(name, version=latest_version)
|
|
240
|
-
compared_chain = latest_dataset.
|
|
240
|
+
compared_chain = latest_dataset.diff(
|
|
241
241
|
processing_chain,
|
|
242
242
|
on=right_on or on,
|
|
243
243
|
added=True,
|
datachain/func/func.py
CHANGED
datachain/lib/arrow.py
CHANGED
|
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
|
|
|
76
76
|
fs_path = file.path
|
|
77
77
|
fs = ReferenceFileSystem({fs_path: [cache_path]})
|
|
78
78
|
else:
|
|
79
|
-
fs, fs_path = file.get_fs(), file.
|
|
79
|
+
fs, fs_path = file.get_fs(), file.get_fs_path()
|
|
80
80
|
|
|
81
81
|
kwargs = self.kwargs
|
|
82
82
|
if format := kwargs.get("format"):
|
|
@@ -160,8 +160,8 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
|
|
|
160
160
|
kwargs["format"] = fix_pyarrow_format(format, parse_options)
|
|
161
161
|
|
|
162
162
|
schemas = []
|
|
163
|
-
for file in chain.
|
|
164
|
-
ds = dataset(file.
|
|
163
|
+
for (file,) in chain.to_iter("file"):
|
|
164
|
+
ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs) # type: ignore[union-attr]
|
|
165
165
|
schemas.append(ds.schema)
|
|
166
166
|
if not schemas:
|
|
167
167
|
raise ValueError(
|
datachain/lib/dataset_info.py
CHANGED
|
@@ -22,8 +22,8 @@ if TYPE_CHECKING:
|
|
|
22
22
|
|
|
23
23
|
class DatasetInfo(DataModel):
|
|
24
24
|
name: str
|
|
25
|
-
|
|
26
|
-
|
|
25
|
+
namespace: str
|
|
26
|
+
project: str
|
|
27
27
|
uuid: str = Field(default=str(uuid4()))
|
|
28
28
|
version: str = Field(default=DEFAULT_DATASET_VERSION)
|
|
29
29
|
status: int = Field(default=DatasetStatus.CREATED)
|
|
@@ -93,8 +93,8 @@ class DatasetInfo(DataModel):
|
|
|
93
93
|
return cls(
|
|
94
94
|
uuid=version.uuid,
|
|
95
95
|
name=dataset.name,
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
namespace=dataset.project.namespace.name,
|
|
97
|
+
project=dataset.project.name,
|
|
98
98
|
version=version.version,
|
|
99
99
|
status=version.status,
|
|
100
100
|
created_at=version.created_at,
|