datachain 0.18.6__py3-none-any.whl → 0.18.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. More details are linked from the original page of this diff.
- datachain/cache.py +1 -1
- datachain/client/azure.py +1 -1
- datachain/client/fsspec.py +10 -5
- datachain/client/gcs.py +2 -2
- datachain/client/local.py +4 -4
- datachain/client/s3.py +9 -3
- datachain/lib/arrow.py +2 -2
- datachain/lib/file.py +108 -34
- datachain/lib/image.py +1 -1
- datachain/lib/tar.py +1 -2
- datachain/lib/utils.py +2 -4
- datachain/lib/video.py +10 -4
- datachain/lib/webdataset.py +1 -1
- datachain/query/dataset.py +1 -1
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/METADATA +2 -2
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/RECORD +20 -20
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/WHEEL +0 -0
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.6.dist-info → datachain-0.18.8.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
|
@@ -76,9 +76,9 @@ class Cache:
|
|
|
76
76
|
async def download(
|
|
77
77
|
self, file: "File", client: "Client", callback: Optional[Callback] = None
|
|
78
78
|
) -> None:
|
|
79
|
-
from_path = f"{file.source}/{file.path}"
|
|
80
79
|
from dvc_objects.fs.utils import tmp_fname
|
|
81
80
|
|
|
81
|
+
from_path = file.get_uri()
|
|
82
82
|
odb_fs = self.odb.fs
|
|
83
83
|
tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname()) # type: ignore[arg-type]
|
|
84
84
|
size = file.size
|
datachain/client/azure.py
CHANGED
|
@@ -15,7 +15,7 @@ class AzureClient(Client):
|
|
|
15
15
|
protocol = "az"
|
|
16
16
|
|
|
17
17
|
def info_to_file(self, v: dict[str, Any], path: str) -> File:
|
|
18
|
-
version_id = v.get("version_id")
|
|
18
|
+
version_id = v.get("version_id") if self._is_version_aware() else None
|
|
19
19
|
return File(
|
|
20
20
|
source=self.uri,
|
|
21
21
|
path=path,
|
datachain/client/fsspec.py
CHANGED
|
@@ -207,13 +207,14 @@ class Client(ABC):
|
|
|
207
207
|
)
|
|
208
208
|
|
|
209
209
|
async def get_current_etag(self, file: "File") -> str:
|
|
210
|
+
file_path = file.get_path_normalized()
|
|
210
211
|
kwargs = {}
|
|
211
|
-
if
|
|
212
|
+
if self._is_version_aware():
|
|
212
213
|
kwargs["version_id"] = file.version
|
|
213
214
|
info = await self.fs._info(
|
|
214
|
-
self.get_full_path(
|
|
215
|
+
self.get_full_path(file_path, file.version), **kwargs
|
|
215
216
|
)
|
|
216
|
-
return self.info_to_file(info,
|
|
217
|
+
return self.info_to_file(info, file_path).etag
|
|
217
218
|
|
|
218
219
|
def get_file_info(self, path: str, version_id: Optional[str] = None) -> "File":
|
|
219
220
|
info = self.fs.info(self.get_full_path(path, version_id), version_id=version_id)
|
|
@@ -326,8 +327,11 @@ class Client(ABC):
|
|
|
326
327
|
"""
|
|
327
328
|
return not (key.startswith("/") or key.endswith("/") or "//" in key)
|
|
328
329
|
|
|
330
|
+
def _is_version_aware(self) -> bool:
|
|
331
|
+
return getattr(self.fs, "version_aware", False)
|
|
332
|
+
|
|
329
333
|
async def ls_dir(self, path):
|
|
330
|
-
if
|
|
334
|
+
if self._is_version_aware():
|
|
331
335
|
kwargs = {"versions": True}
|
|
332
336
|
return await self.fs._ls(path, detail=True, **kwargs)
|
|
333
337
|
|
|
@@ -382,7 +386,8 @@ class Client(ABC):
|
|
|
382
386
|
return open(cache_path, mode="rb")
|
|
383
387
|
assert not file.location
|
|
384
388
|
return FileWrapper(
|
|
385
|
-
self.fs.open(self.get_full_path(file.
|
|
389
|
+
self.fs.open(self.get_full_path(file.get_path_normalized(), file.version)),
|
|
390
|
+
cb,
|
|
386
391
|
) # type: ignore[return-value]
|
|
387
392
|
|
|
388
393
|
def upload(self, data: bytes, path: str) -> "File":
|
datachain/client/gcs.py
CHANGED
|
@@ -115,7 +115,7 @@ class GCSClient(Client):
|
|
|
115
115
|
maxResults=page_size,
|
|
116
116
|
pageToken=next_page_token,
|
|
117
117
|
json_out=True,
|
|
118
|
-
versions="true",
|
|
118
|
+
versions="true" if self._is_version_aware() else "false",
|
|
119
119
|
)
|
|
120
120
|
assert page["kind"] == "storage#objects"
|
|
121
121
|
await page_queue.put(page.get("items", []))
|
|
@@ -134,7 +134,7 @@ class GCSClient(Client):
|
|
|
134
134
|
source=self.uri,
|
|
135
135
|
path=path,
|
|
136
136
|
etag=v.get("etag", ""),
|
|
137
|
-
version=v.get("generation", ""),
|
|
137
|
+
version=v.get("generation", "") if self._is_version_aware() else "",
|
|
138
138
|
is_latest=not v.get("timeDeleted"),
|
|
139
139
|
last_modified=self.parse_timestamp(v["updated"]),
|
|
140
140
|
size=v.get("size", ""),
|
datachain/client/local.py
CHANGED
|
@@ -99,7 +99,7 @@ class FileClient(Client):
|
|
|
99
99
|
)
|
|
100
100
|
|
|
101
101
|
async def get_current_etag(self, file: "File") -> str:
|
|
102
|
-
info = self.fs.info(self.get_full_path(file.
|
|
102
|
+
info = self.fs.info(self.get_full_path(file.get_path_normalized()))
|
|
103
103
|
return self.info_to_file(info, "").etag
|
|
104
104
|
|
|
105
105
|
async def get_size(self, path: str, version_id: Optional[str] = None) -> int:
|
|
@@ -138,8 +138,8 @@ class FileClient(Client):
|
|
|
138
138
|
if not self.use_symlinks:
|
|
139
139
|
super().fetch_nodes(nodes, shared_progress_bar)
|
|
140
140
|
|
|
141
|
-
def do_instantiate_object(self,
|
|
141
|
+
def do_instantiate_object(self, file: File, dst: str) -> None:
|
|
142
142
|
if self.use_symlinks:
|
|
143
|
-
os.symlink(Path(self.name,
|
|
143
|
+
os.symlink(Path(self.name, file.path), dst)
|
|
144
144
|
else:
|
|
145
|
-
super().do_instantiate_object(
|
|
145
|
+
super().do_instantiate_object(file, dst)
|
datachain/client/s3.py
CHANGED
|
@@ -101,7 +101,7 @@ class ClientS3(Client):
|
|
|
101
101
|
prefix = start_prefix
|
|
102
102
|
if prefix:
|
|
103
103
|
prefix = prefix.lstrip(DELIMITER) + DELIMITER
|
|
104
|
-
versions =
|
|
104
|
+
versions = self._is_version_aware()
|
|
105
105
|
fs = self.fs
|
|
106
106
|
await fs.set_session()
|
|
107
107
|
s3 = await fs.get_s3(self.name)
|
|
@@ -139,7 +139,9 @@ class ClientS3(Client):
|
|
|
139
139
|
source=self.uri,
|
|
140
140
|
path=v["Key"],
|
|
141
141
|
etag=v.get("ETag", "").strip('"'),
|
|
142
|
-
version=
|
|
142
|
+
version=(
|
|
143
|
+
ClientS3.clean_s3_version(v.get("VersionId", "")) if versions else ""
|
|
144
|
+
),
|
|
143
145
|
is_latest=v.get("IsLatest", True),
|
|
144
146
|
last_modified=v.get("LastModified", ""),
|
|
145
147
|
size=v["Size"],
|
|
@@ -193,7 +195,11 @@ class ClientS3(Client):
|
|
|
193
195
|
source=self.uri,
|
|
194
196
|
path=path,
|
|
195
197
|
size=v["size"],
|
|
196
|
-
version=
|
|
198
|
+
version=(
|
|
199
|
+
ClientS3.clean_s3_version(v.get("VersionId", ""))
|
|
200
|
+
if self._is_version_aware()
|
|
201
|
+
else ""
|
|
202
|
+
),
|
|
197
203
|
etag=v.get("ETag", "").strip('"'),
|
|
198
204
|
is_latest=v.get("IsLatest", True),
|
|
199
205
|
last_modified=v.get("LastModified", ""),
|
datachain/lib/arrow.py
CHANGED
|
@@ -76,7 +76,7 @@ class ArrowGenerator(Generator):
|
|
|
76
76
|
fs_path = file.path
|
|
77
77
|
fs = ReferenceFileSystem({fs_path: [cache_path]})
|
|
78
78
|
else:
|
|
79
|
-
fs, fs_path = file.get_fs(), file.
|
|
79
|
+
fs, fs_path = file.get_fs(), file.get_fs_path()
|
|
80
80
|
|
|
81
81
|
kwargs = self.kwargs
|
|
82
82
|
if format := kwargs.get("format"):
|
|
@@ -161,7 +161,7 @@ def infer_schema(chain: "DataChain", **kwargs) -> pa.Schema:
|
|
|
161
161
|
|
|
162
162
|
schemas = []
|
|
163
163
|
for file in chain.collect("file"):
|
|
164
|
-
ds = dataset(file.
|
|
164
|
+
ds = dataset(file.get_fs_path(), filesystem=file.get_fs(), **kwargs) # type: ignore[union-attr]
|
|
165
165
|
schemas.append(ds.schema)
|
|
166
166
|
if not schemas:
|
|
167
167
|
raise ValueError(
|
datachain/lib/file.py
CHANGED
|
@@ -5,13 +5,14 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
import os
|
|
7
7
|
import posixpath
|
|
8
|
+
import warnings
|
|
8
9
|
from abc import ABC, abstractmethod
|
|
9
10
|
from collections.abc import Iterator
|
|
10
11
|
from contextlib import contextmanager
|
|
11
12
|
from datetime import datetime
|
|
12
13
|
from functools import partial
|
|
13
14
|
from io import BytesIO
|
|
14
|
-
from pathlib import Path, PurePosixPath
|
|
15
|
+
from pathlib import Path, PurePath, PurePosixPath
|
|
15
16
|
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
|
|
16
17
|
from urllib.parse import unquote, urlparse
|
|
17
18
|
from urllib.request import url2pathname
|
|
@@ -69,7 +70,7 @@ class FileExporter(NodesThreadPool):
|
|
|
69
70
|
for task in done:
|
|
70
71
|
task.result()
|
|
71
72
|
|
|
72
|
-
def do_task(self, file):
|
|
73
|
+
def do_task(self, file: "File"):
|
|
73
74
|
file.export(
|
|
74
75
|
self.output,
|
|
75
76
|
self.placement,
|
|
@@ -81,14 +82,28 @@ class FileExporter(NodesThreadPool):
|
|
|
81
82
|
|
|
82
83
|
|
|
83
84
|
class VFileError(DataChainError):
|
|
84
|
-
def __init__(self,
|
|
85
|
+
def __init__(self, message: str, source: str, path: str, vtype: str = ""):
|
|
86
|
+
self.message = message
|
|
87
|
+
self.source = source
|
|
88
|
+
self.path = path
|
|
89
|
+
self.vtype = vtype
|
|
90
|
+
|
|
85
91
|
type_ = f" of vtype '{vtype}'" if vtype else ""
|
|
86
|
-
super().__init__(f"Error in v-file '{
|
|
92
|
+
super().__init__(f"Error in v-file '{source}/{path}'{type_}: {message}")
|
|
93
|
+
|
|
94
|
+
def __reduce__(self):
|
|
95
|
+
return self.__class__, (self.message, self.source, self.path, self.vtype)
|
|
87
96
|
|
|
88
97
|
|
|
89
98
|
class FileError(DataChainError):
|
|
90
|
-
def __init__(self,
|
|
91
|
-
|
|
99
|
+
def __init__(self, message: str, source: str, path: str):
|
|
100
|
+
self.message = message
|
|
101
|
+
self.source = source
|
|
102
|
+
self.path = path
|
|
103
|
+
super().__init__(f"Error in file '{source}/{path}': {message}")
|
|
104
|
+
|
|
105
|
+
def __reduce__(self):
|
|
106
|
+
return self.__class__, (self.message, self.source, self.path)
|
|
92
107
|
|
|
93
108
|
|
|
94
109
|
class VFile(ABC):
|
|
@@ -114,18 +129,20 @@ class TarVFile(VFile):
|
|
|
114
129
|
def open(cls, file: "File", location: list[dict]):
|
|
115
130
|
"""Stream file from tar archive based on location in archive."""
|
|
116
131
|
if len(location) > 1:
|
|
117
|
-
raise VFileError(
|
|
132
|
+
raise VFileError(
|
|
133
|
+
"multiple 'location's are not supported yet", file.source, file.path
|
|
134
|
+
)
|
|
118
135
|
|
|
119
136
|
loc = location[0]
|
|
120
137
|
|
|
121
138
|
if (offset := loc.get("offset", None)) is None:
|
|
122
|
-
raise VFileError(
|
|
139
|
+
raise VFileError("'offset' is not specified", file.source, file.path)
|
|
123
140
|
|
|
124
141
|
if (size := loc.get("size", None)) is None:
|
|
125
|
-
raise VFileError(
|
|
142
|
+
raise VFileError("'size' is not specified", file.source, file.path)
|
|
126
143
|
|
|
127
144
|
if (parent := loc.get("parent", None)) is None:
|
|
128
|
-
raise VFileError(
|
|
145
|
+
raise VFileError("'parent' is not specified", file.source, file.path)
|
|
129
146
|
|
|
130
147
|
tar_file = File(**parent)
|
|
131
148
|
tar_file._set_stream(file._catalog)
|
|
@@ -145,14 +162,18 @@ class VFileRegistry:
|
|
|
145
162
|
@classmethod
|
|
146
163
|
def resolve(cls, file: "File", location: list[dict]):
|
|
147
164
|
if len(location) == 0:
|
|
148
|
-
raise VFileError(
|
|
165
|
+
raise VFileError(
|
|
166
|
+
"'location' must not be list of JSONs", file.source, file.path
|
|
167
|
+
)
|
|
149
168
|
|
|
150
169
|
if not (vtype := location[0].get("vtype", "")):
|
|
151
|
-
raise VFileError(
|
|
170
|
+
raise VFileError("vtype is not specified", file.source, file.path)
|
|
152
171
|
|
|
153
172
|
reader = cls._vtype_readers.get(vtype, None)
|
|
154
173
|
if not reader:
|
|
155
|
-
raise VFileError(
|
|
174
|
+
raise VFileError(
|
|
175
|
+
"reader not registered", file.source, file.path, vtype=vtype
|
|
176
|
+
)
|
|
156
177
|
|
|
157
178
|
return reader.open(file, location)
|
|
158
179
|
|
|
@@ -236,8 +257,8 @@ class File(DataModel):
|
|
|
236
257
|
|
|
237
258
|
@field_validator("path", mode="before")
|
|
238
259
|
@classmethod
|
|
239
|
-
def validate_path(cls, path):
|
|
240
|
-
return
|
|
260
|
+
def validate_path(cls, path: str) -> str:
|
|
261
|
+
return PurePath(path).as_posix() if path else ""
|
|
241
262
|
|
|
242
263
|
def model_dump_custom(self):
|
|
243
264
|
res = self.model_dump()
|
|
@@ -299,11 +320,11 @@ class File(DataModel):
|
|
|
299
320
|
return cls(**{key: row[key] for key in cls._datachain_column_types})
|
|
300
321
|
|
|
301
322
|
@property
|
|
302
|
-
def name(self):
|
|
323
|
+
def name(self) -> str:
|
|
303
324
|
return PurePosixPath(self.path).name
|
|
304
325
|
|
|
305
326
|
@property
|
|
306
|
-
def parent(self):
|
|
327
|
+
def parent(self) -> str:
|
|
307
328
|
return str(PurePosixPath(self.path).parent)
|
|
308
329
|
|
|
309
330
|
@contextmanager
|
|
@@ -346,7 +367,7 @@ class File(DataModel):
|
|
|
346
367
|
|
|
347
368
|
client.upload(self.read(), destination)
|
|
348
369
|
|
|
349
|
-
def _symlink_to(self, destination: str):
|
|
370
|
+
def _symlink_to(self, destination: str) -> None:
|
|
350
371
|
if self.location:
|
|
351
372
|
raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
|
|
352
373
|
|
|
@@ -355,7 +376,7 @@ class File(DataModel):
|
|
|
355
376
|
source = self.get_local_path()
|
|
356
377
|
assert source, "File was not cached"
|
|
357
378
|
elif self.source.startswith("file://"):
|
|
358
|
-
source = self.
|
|
379
|
+
source = self.get_fs_path()
|
|
359
380
|
else:
|
|
360
381
|
raise OSError(errno.EXDEV, "can't link across filesystems")
|
|
361
382
|
|
|
@@ -432,27 +453,62 @@ class File(DataModel):
|
|
|
432
453
|
|
|
433
454
|
def get_file_ext(self):
|
|
434
455
|
"""Returns last part of file name without `.`."""
|
|
435
|
-
return PurePosixPath(self.path).suffix.
|
|
456
|
+
return PurePosixPath(self.path).suffix.lstrip(".")
|
|
436
457
|
|
|
437
458
|
def get_file_stem(self):
|
|
438
459
|
"""Returns file name without extension."""
|
|
439
460
|
return PurePosixPath(self.path).stem
|
|
440
461
|
|
|
441
462
|
def get_full_name(self):
|
|
442
|
-
"""
|
|
463
|
+
"""
|
|
464
|
+
[DEPRECATED] Use `file.path` directly instead.
|
|
465
|
+
|
|
466
|
+
Returns name with parent directories.
|
|
467
|
+
"""
|
|
468
|
+
warnings.warn(
|
|
469
|
+
"file.get_full_name() is deprecated and will be removed "
|
|
470
|
+
"in a future version. Use `file.path` directly.",
|
|
471
|
+
DeprecationWarning,
|
|
472
|
+
stacklevel=2,
|
|
473
|
+
)
|
|
443
474
|
return self.path
|
|
444
475
|
|
|
445
|
-
def
|
|
476
|
+
def get_path_normalized(self) -> str:
|
|
477
|
+
if not self.path:
|
|
478
|
+
raise FileError("path must not be empty", self.source, self.path)
|
|
479
|
+
|
|
480
|
+
if self.path.endswith("/"):
|
|
481
|
+
raise FileError("path must not be a directory", self.source, self.path)
|
|
482
|
+
|
|
483
|
+
normpath = os.path.normpath(self.path)
|
|
484
|
+
normpath = PurePath(normpath).as_posix()
|
|
485
|
+
|
|
486
|
+
if normpath == ".":
|
|
487
|
+
raise FileError("path must not be a directory", self.source, self.path)
|
|
488
|
+
|
|
489
|
+
if any(part == ".." for part in PurePath(normpath).parts):
|
|
490
|
+
raise FileError("path must not contain '..'", self.source, self.path)
|
|
491
|
+
|
|
492
|
+
return normpath
|
|
493
|
+
|
|
494
|
+
def get_uri(self) -> str:
|
|
446
495
|
"""Returns file URI."""
|
|
447
|
-
return f"{self.source}/{self.
|
|
496
|
+
return f"{self.source}/{self.get_path_normalized()}"
|
|
497
|
+
|
|
498
|
+
def get_fs_path(self) -> str:
|
|
499
|
+
"""
|
|
500
|
+
Returns file path with respect to the filescheme.
|
|
448
501
|
|
|
449
|
-
|
|
450
|
-
|
|
502
|
+
The path is always normalized (see `get_path_normalized`) to remove any redundant
|
|
503
|
+
separators and up-level references.
|
|
504
|
+
|
|
505
|
+
If the file scheme is "file", the path is converted to a local file path
|
|
506
|
+
using `url2pathname`. Otherwise, the original path with scheme is returned.
|
|
507
|
+
"""
|
|
451
508
|
path = unquote(self.get_uri())
|
|
452
|
-
|
|
453
|
-
if
|
|
454
|
-
path =
|
|
455
|
-
path = url2pathname(path)
|
|
509
|
+
path_parsed = urlparse(path)
|
|
510
|
+
if path_parsed.scheme == "file":
|
|
511
|
+
path = url2pathname(path_parsed.path)
|
|
456
512
|
return path
|
|
457
513
|
|
|
458
514
|
def get_destination_path(
|
|
@@ -467,7 +523,7 @@ class File(DataModel):
|
|
|
467
523
|
elif placement == "etag":
|
|
468
524
|
path = f"{self.etag}{self.get_file_suffix()}"
|
|
469
525
|
elif placement == "fullpath":
|
|
470
|
-
path = unquote(self.
|
|
526
|
+
path = unquote(self.get_path_normalized())
|
|
471
527
|
source = urlparse(self.source)
|
|
472
528
|
if source.scheme and source.scheme != "file":
|
|
473
529
|
path = posixpath.join(source.netloc, path)
|
|
@@ -505,8 +561,9 @@ class File(DataModel):
|
|
|
505
561
|
) from e
|
|
506
562
|
|
|
507
563
|
try:
|
|
508
|
-
|
|
509
|
-
|
|
564
|
+
normalized_path = self.get_path_normalized()
|
|
565
|
+
info = client.fs.info(client.get_full_path(normalized_path))
|
|
566
|
+
converted_info = client.info_to_file(info, normalized_path)
|
|
510
567
|
return type(self)(
|
|
511
568
|
path=self.path,
|
|
512
569
|
source=self.source,
|
|
@@ -517,8 +574,17 @@ class File(DataModel):
|
|
|
517
574
|
last_modified=converted_info.last_modified,
|
|
518
575
|
location=self.location,
|
|
519
576
|
)
|
|
577
|
+
except FileError as e:
|
|
578
|
+
logger.warning(
|
|
579
|
+
"File error when resolving %s/%s: %s", self.source, self.path, str(e)
|
|
580
|
+
)
|
|
520
581
|
except (FileNotFoundError, PermissionError, OSError) as e:
|
|
521
|
-
logger.warning(
|
|
582
|
+
logger.warning(
|
|
583
|
+
"File system error when resolving %s/%s: %s",
|
|
584
|
+
self.source,
|
|
585
|
+
self.path,
|
|
586
|
+
str(e),
|
|
587
|
+
)
|
|
522
588
|
|
|
523
589
|
return type(self)(
|
|
524
590
|
path=self.path,
|
|
@@ -534,6 +600,8 @@ class File(DataModel):
|
|
|
534
600
|
|
|
535
601
|
def resolve(file: File) -> File:
|
|
536
602
|
"""
|
|
603
|
+
[DEPRECATED] Use `file.resolve()` directly instead.
|
|
604
|
+
|
|
537
605
|
Resolve a File object by checking its existence and updating its metadata.
|
|
538
606
|
|
|
539
607
|
This function is a wrapper around the File.resolve() method, designed to be
|
|
@@ -549,6 +617,12 @@ def resolve(file: File) -> File:
|
|
|
549
617
|
RuntimeError: If the file's catalog is not set or if
|
|
550
618
|
the file source protocol is unsupported.
|
|
551
619
|
"""
|
|
620
|
+
warnings.warn(
|
|
621
|
+
"resolve() is deprecated and will be removed "
|
|
622
|
+
"in a future version. Use file.resolve() directly.",
|
|
623
|
+
DeprecationWarning,
|
|
624
|
+
stacklevel=2,
|
|
625
|
+
)
|
|
552
626
|
return file.resolve()
|
|
553
627
|
|
|
554
628
|
|
|
@@ -896,7 +970,7 @@ class ArrowRow(DataModel):
|
|
|
896
970
|
ds = dataset(path, **self.kwargs)
|
|
897
971
|
|
|
898
972
|
else:
|
|
899
|
-
path = self.file.
|
|
973
|
+
path = self.file.get_fs_path()
|
|
900
974
|
ds = dataset(path, filesystem=self.file.get_fs(), **self.kwargs)
|
|
901
975
|
|
|
902
976
|
return ds.take([self.index]).to_reader()
|
datachain/lib/image.py
CHANGED
|
@@ -19,7 +19,7 @@ def image_info(file: Union[File, ImageFile]) -> Image:
|
|
|
19
19
|
try:
|
|
20
20
|
img = file.as_image_file().read()
|
|
21
21
|
except Exception as exc:
|
|
22
|
-
raise FileError(
|
|
22
|
+
raise FileError("unable to open image file", file.source, file.path) from exc
|
|
23
23
|
|
|
24
24
|
return Image(
|
|
25
25
|
width=img.width,
|
datachain/lib/tar.py
CHANGED
|
@@ -6,12 +6,11 @@ from datachain.lib.file import File, TarVFile
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def build_tar_member(parent: File, info: tarfile.TarInfo) -> File:
|
|
9
|
-
new_parent = parent.get_full_name()
|
|
10
9
|
etag_string = "-".join([parent.etag, info.name, str(info.mtime)])
|
|
11
10
|
etag = hashlib.md5(etag_string.encode(), usedforsecurity=False).hexdigest()
|
|
12
11
|
return File(
|
|
13
12
|
source=parent.source,
|
|
14
|
-
path=f"{
|
|
13
|
+
path=f"{parent.name}/{info.name}",
|
|
15
14
|
version=parent.version,
|
|
16
15
|
size=info.size,
|
|
17
16
|
etag=etag,
|
datachain/lib/utils.py
CHANGED
|
@@ -18,13 +18,11 @@ class AbstractUDF(ABC):
|
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
class DataChainError(Exception):
|
|
21
|
-
|
|
22
|
-
super().__init__(message)
|
|
21
|
+
pass
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
class DataChainParamsError(DataChainError):
|
|
26
|
-
|
|
27
|
-
super().__init__(message)
|
|
25
|
+
pass
|
|
28
26
|
|
|
29
27
|
|
|
30
28
|
class DataChainColumnError(DataChainParamsError):
|
datachain/lib/video.py
CHANGED
|
@@ -34,21 +34,27 @@ def video_info(file: Union[File, VideoFile]) -> Video:
|
|
|
34
34
|
file.ensure_cached()
|
|
35
35
|
file_path = file.get_local_path()
|
|
36
36
|
if not file_path:
|
|
37
|
-
raise FileError(
|
|
37
|
+
raise FileError("unable to download video file", file.source, file.path)
|
|
38
38
|
|
|
39
39
|
try:
|
|
40
40
|
probe = ffmpeg.probe(file_path)
|
|
41
41
|
except Exception as exc:
|
|
42
|
-
raise FileError(
|
|
42
|
+
raise FileError(
|
|
43
|
+
"unable to extract metadata from video file", file.source, file.path
|
|
44
|
+
) from exc
|
|
43
45
|
|
|
44
46
|
all_streams = probe.get("streams")
|
|
45
47
|
video_format = probe.get("format")
|
|
46
48
|
if not all_streams or not video_format:
|
|
47
|
-
raise FileError(
|
|
49
|
+
raise FileError(
|
|
50
|
+
"unable to extract metadata from video file", file.source, file.path
|
|
51
|
+
)
|
|
48
52
|
|
|
49
53
|
video_streams = [s for s in all_streams if s["codec_type"] == "video"]
|
|
50
54
|
if len(video_streams) == 0:
|
|
51
|
-
raise FileError(
|
|
55
|
+
raise FileError(
|
|
56
|
+
"unable to extract metadata from video file", file.source, file.path
|
|
57
|
+
)
|
|
52
58
|
|
|
53
59
|
video_stream = video_streams[0]
|
|
54
60
|
|
datachain/lib/webdataset.py
CHANGED
|
@@ -35,7 +35,7 @@ warnings.filterwarnings(
|
|
|
35
35
|
|
|
36
36
|
class WDSError(DataChainError):
|
|
37
37
|
def __init__(self, tar_stream, message: str):
|
|
38
|
-
super().__init__(f"WebDataset error '{tar_stream.
|
|
38
|
+
super().__init__(f"WebDataset error '{tar_stream.name}': {message}")
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
class CoreFileDuplicationError(WDSError):
|
datachain/query/dataset.py
CHANGED
|
@@ -1348,7 +1348,7 @@ class DatasetQuery:
|
|
|
1348
1348
|
|
|
1349
1349
|
async def get_params(row: Sequence) -> tuple:
|
|
1350
1350
|
row_dict = RowDict(zip(query_fields, row))
|
|
1351
|
-
return tuple(
|
|
1351
|
+
return tuple( # noqa: C409
|
|
1352
1352
|
[
|
|
1353
1353
|
await p.get_value_async(
|
|
1354
1354
|
self.catalog, row_dict, mapper, **kwargs
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.18.
|
|
3
|
+
Version: 0.18.8
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -94,7 +94,7 @@ Requires-Dist: scipy; extra == "tests"
|
|
|
94
94
|
Requires-Dist: ultralytics; extra == "tests"
|
|
95
95
|
Provides-Extra: dev
|
|
96
96
|
Requires-Dist: datachain[docs,tests]; extra == "dev"
|
|
97
|
-
Requires-Dist: mypy==1.
|
|
97
|
+
Requires-Dist: mypy==1.16.0; extra == "dev"
|
|
98
98
|
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
99
99
|
Requires-Dist: types-pytz; extra == "dev"
|
|
100
100
|
Requires-Dist: types-PyYAML; extra == "dev"
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
datachain/__init__.py,sha256=Dx_Dw6AuvC_CZtXxfRv0Z-ND6ieC4Cz-tZkMW-Rvmz4,1496
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
|
|
4
|
-
datachain/cache.py,sha256=
|
|
4
|
+
datachain/cache.py,sha256=3GWMvF2LMpz2l5lWbtbpmzSB-92eGCCtujeWlFa3r14,3609
|
|
5
5
|
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
|
|
6
6
|
datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
|
|
7
7
|
datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
|
|
@@ -37,13 +37,13 @@ datachain/cli/parser/job.py,sha256=acdVYuTsqluRDI_FYhZ1ohjQcVtBj-taUm8y9tGb0_0,4
|
|
|
37
37
|
datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
|
|
38
38
|
datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
|
|
39
39
|
datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
|
|
40
|
-
datachain/client/azure.py,sha256=
|
|
40
|
+
datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
|
|
41
41
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
42
|
-
datachain/client/fsspec.py,sha256=
|
|
43
|
-
datachain/client/gcs.py,sha256=
|
|
42
|
+
datachain/client/fsspec.py,sha256=SSKhvl7x2IzECYUsJ_4hYxvy46AiU0wpsfPduE9alFI,13995
|
|
43
|
+
datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
|
|
44
44
|
datachain/client/hf.py,sha256=posnI5WOKOMG1yY_ZiV9Orcd24QsUPKZlOXgJVLxxrM,1558
|
|
45
|
-
datachain/client/local.py,sha256=
|
|
46
|
-
datachain/client/s3.py,sha256=
|
|
45
|
+
datachain/client/local.py,sha256=0J52Wzvw25hSucVlzBvLuMRAZwrAHZAYDvD1mNBqf4c,4607
|
|
46
|
+
datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
|
|
47
47
|
datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
|
|
48
48
|
datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
|
|
49
49
|
datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
|
|
@@ -68,13 +68,13 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
|
|
|
68
68
|
datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
|
|
69
69
|
datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
|
|
70
70
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
-
datachain/lib/arrow.py,sha256=
|
|
71
|
+
datachain/lib/arrow.py,sha256=K8djofgt4HEgxnkwqZZChccAqeIQ_1D2urGyqti-1-4,10259
|
|
72
72
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
73
73
|
datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
|
|
74
74
|
datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo,3179
|
|
75
|
-
datachain/lib/file.py,sha256
|
|
75
|
+
datachain/lib/file.py,sha256=-Y0ccgfQt-2jOnNhOH5j5fTQpCsS9z2ja97umDUHbmA,33054
|
|
76
76
|
datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
|
|
77
|
-
datachain/lib/image.py,sha256=
|
|
77
|
+
datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
|
|
78
78
|
datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
|
|
79
79
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
80
80
|
datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A,6349
|
|
@@ -82,13 +82,13 @@ datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,
|
|
|
82
82
|
datachain/lib/pytorch.py,sha256=elrmJ4YUDC2LZ9yXM1KwImVBOYIBJf6k0ZR7eSe6Aao,7712
|
|
83
83
|
datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
|
|
84
84
|
datachain/lib/signal_schema.py,sha256=Zhg8qThFDf9eoNWFH6KGeYB-sIGys7A_ybq2CUBG7Dg,36127
|
|
85
|
-
datachain/lib/tar.py,sha256=
|
|
85
|
+
datachain/lib/tar.py,sha256=k8RFnF72H1jxbMghQQbmoGL-UsA1im8gRLXBM1GJAYI,999
|
|
86
86
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
87
87
|
datachain/lib/udf.py,sha256=FWqA476ygdk4MU-0qehYKxvnt8Tekh21Cyf3RgddD1k,16674
|
|
88
88
|
datachain/lib/udf_signature.py,sha256=2EtsOPDNSPqcOlYwqbCdy6RF5MldI-7smii8aLy8p7Y,7543
|
|
89
|
-
datachain/lib/utils.py,sha256=
|
|
90
|
-
datachain/lib/video.py,sha256=
|
|
91
|
-
datachain/lib/webdataset.py,sha256=
|
|
89
|
+
datachain/lib/utils.py,sha256=rG2y7NwTqZOuomZZRmrA-Q-ANM_j1cToQYqDJoOeGyU,1480
|
|
90
|
+
datachain/lib/video.py,sha256=u6fLJWj5G6QqsVkpfHnKGklBNpG3BRRg6v3izngnNcU,6767
|
|
91
|
+
datachain/lib/webdataset.py,sha256=hZWar13LoZ1TAidFW_sl9rUO-KtMJQY3OFmbnPkJw_A,6913
|
|
92
92
|
datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
|
|
93
93
|
datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
94
94
|
datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1NY4,1505
|
|
@@ -121,7 +121,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
|
|
|
121
121
|
datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
|
|
122
122
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
123
123
|
datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
|
|
124
|
-
datachain/query/dataset.py,sha256=
|
|
124
|
+
datachain/query/dataset.py,sha256=dI51zOU1Drev65f6SPn4mvRdwRXs4SOW5STMm3WYd7A,60601
|
|
125
125
|
datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
|
|
126
126
|
datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
|
|
127
127
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
153
153
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
154
154
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
155
155
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
156
|
-
datachain-0.18.
|
|
157
|
-
datachain-0.18.
|
|
158
|
-
datachain-0.18.
|
|
159
|
-
datachain-0.18.
|
|
160
|
-
datachain-0.18.
|
|
161
|
-
datachain-0.18.
|
|
156
|
+
datachain-0.18.8.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
157
|
+
datachain-0.18.8.dist-info/METADATA,sha256=7_EQNrTrI5u-hjaGNfJOamf3LW-qljTmCuELCFkA2yE,11319
|
|
158
|
+
datachain-0.18.8.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
159
|
+
datachain-0.18.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
160
|
+
datachain-0.18.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
161
|
+
datachain-0.18.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|