datachain 0.3.17__py3-none-any.whl → 0.3.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +8 -18
- datachain/cli.py +7 -1
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/lib/arrow.py +2 -1
- datachain/lib/dc.py +4 -0
- datachain/lib/file.py +41 -23
- datachain/lib/listing.py +2 -0
- datachain/listing.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/dataset.py +6 -18
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +25 -24
- datachain/query/udf.py +0 -106
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/METADATA +5 -3
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/RECORD +27 -26
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/LICENSE +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/WHEEL +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/top_level.txt +0 -0
datachain/cache.py
CHANGED
@@ -1,56 +1,15 @@
-import hashlib
-import json
 import os
-from datetime import datetime
-from functools import partial
 from typing import TYPE_CHECKING, Optional
 
-import attrs
 from dvc_data.hashfile.db.local import LocalHashFileDB
 from dvc_objects.fs.local import LocalFileSystem
 from fsspec.callbacks import Callback, TqdmCallback
 
-from datachain.utils import TIME_ZERO
-
 from .progress import Tqdm
 
 if TYPE_CHECKING:
     from datachain.client import Client
-    from datachain.storage import StorageURI
-
-sha256 = partial(hashlib.sha256, usedforsecurity=False)
-
-
-@attrs.frozen
-class UniqueId:
-    storage: "StorageURI"
-    path: str
-    size: int
-    etag: str
-    version: str = ""
-    is_latest: bool = True
-    location: Optional[str] = None
-    last_modified: datetime = TIME_ZERO
-
-    def get_parsed_location(self) -> Optional[dict]:
-        if not self.location:
-            return None
-
-        loc_stack = (
-            json.loads(self.location)
-            if isinstance(self.location, str)
-            else self.location
-        )
-        if len(loc_stack) > 1:
-            raise NotImplementedError("Nested v-objects are not supported yet.")
-
-        return loc_stack[0]
-
-    def get_hash(self) -> str:
-        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
-        if self.location:
-            fingerprint += f"/{self.location}"
-        return sha256(fingerprint.encode()).hexdigest()
+    from datachain.lib.file import File
 
 
 def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
     def tmp_dir(self):
         return self.odb.tmp_dir
 
-    def get_path(self, uid: "UniqueId") -> Optional[str]:
-        if self.contains(uid):
-            return self.path_from_checksum(uid.get_hash())
+    def get_path(self, file: "File") -> Optional[str]:
+        if self.contains(file):
+            return self.path_from_checksum(file.get_hash())
         return None
 
-    def contains(self, uid: "UniqueId") -> bool:
-        return self.odb.exists(uid.get_hash())
+    def contains(self, file: "File") -> bool:
+        return self.odb.exists(file.get_hash())
 
     def path_from_checksum(self, checksum: str) -> str:
        assert checksum
        return self.odb.oid_to_path(checksum)
 
-    def remove(self, uid: "UniqueId") -> None:
-        self.odb.delete(uid.get_hash())
+    def remove(self, file: "File") -> None:
+        self.odb.delete(file.get_hash())
 
     async def download(
-        self, uid: "UniqueId", client: "Client", callback: Optional[Callback] = None
+        self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
-        from_path = f"{uid.storage}/{uid.path}"
+        from_path = f"{file.source}/{file.path}"
        from dvc_objects.fs.utils import tmp_fname
 
        odb_fs = self.odb.fs
        tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
-        size = uid.size
+        size = file.size
        if size < 0:
            size = await client.get_size(from_path)
        cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
            cb.close()
 
        try:
-            oid = uid.get_hash()
+            oid = file.get_hash()
            self.odb.add(tmp_info, self.odb.fs, oid)
        finally:
            os.unlink(tmp_info)
 
-    def store_data(self, uid: "UniqueId", contents: bytes) -> None:
-        checksum = uid.get_hash()
+    def store_data(self, file: "File", contents: bytes) -> None:
+        checksum = file.get_hash()
        dst = self.path_from_checksum(checksum)
        if not os.path.exists(dst):
            # Create the file only if it's not already in cache
datachain/catalog/catalog.py
CHANGED
@@ -34,7 +34,7 @@ import yaml
 from sqlalchemy import Column
 from tqdm import tqdm
 
-from datachain.cache import DataChainCache, UniqueId
+from datachain.cache import DataChainCache
 from datachain.client import Client
 from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
@@ -619,13 +619,13 @@ class Catalog:
        code_ast.body[-1:] = new_expressions
        return code_ast
 
-    def get_client(self, uri: StorageURI, **config: Any) -> Client:
+    def get_client(self, uri: str, **config: Any) -> Client:
        """
        Return the client corresponding to the given source `uri`.
        """
        config = config or self.client_config
        cls = Client.get_implementation(uri)
-        return cls.from_source(uri, self.cache, **config)
+        return cls.from_source(StorageURI(uri), self.cache, **config)
 
    def enlist_source(
        self,
@@ -1431,7 +1431,7 @@ class Catalog:
 
    def get_file_signals(
        self, dataset_name: str, dataset_version: int, row: RowDict
-    ) -> Optional[dict]:
+    ) -> Optional[RowDict]:
        """
        Function that returns file signals from dataset row.
        Note that signal names are without prefix, so if there was 'laion__file__source'
@@ -1448,7 +1448,7 @@ class Catalog:
 
        version = self.get_dataset(dataset_name).get_version(dataset_version)
 
-        file_signals_values = {}
+        file_signals_values = RowDict()
 
        schema = SignalSchema.deserialize(version.feature_schema)
        for file_signals in schema.get_signals(File):
@@ -1476,6 +1476,8 @@ class Catalog:
        use_cache: bool = True,
        **config: Any,
    ):
+        from datachain.lib.file import File
+
        file_signals = self.get_file_signals(dataset_name, dataset_version, row)
        if not file_signals:
            raise RuntimeError("Cannot open object without file signals")
@@ -1483,22 +1485,10 @@ class Catalog:
        config = config or self.client_config
        client = self.get_client(file_signals["source"], **config)
        return client.open_object(
-            self._get_row_uid(file_signals),
+            File._from_row(file_signals),
            use_cache=use_cache,
        )
 
-    def _get_row_uid(self, row: RowDict) -> UniqueId:
-        return UniqueId(
-            row["source"],
-            row["path"],
-            row["size"],
-            row["etag"],
-            row["version"],
-            row["is_latest"],
-            row["location"],
-            row["last_modified"],
-        )
-
    def ls(
        self,
        sources: list[str],
datachain/cli.py
CHANGED
@@ -15,6 +15,7 @@ import shtab
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.telemetry import telemetry
 from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
@@ -872,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
    # This also sets this environment variable for any subprocesses
    os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
 
+    error = None
    try:
        catalog = get_catalog(client_config=client_config)
        if args.command == "cp":
@@ -1003,14 +1005,16 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
            print(f"invalid command: {args.command}", file=sys.stderr)
            return 1
        return 0
-    except BrokenPipeError:
+    except BrokenPipeError as exc:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+        error = str(exc)
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        return 141  # 128 + 13 (SIGPIPE)
    except (KeyboardInterrupt, Exception) as exc:
+        error = str(exc)
        if isinstance(exc, KeyboardInterrupt):
            msg = "Operation cancelled by the user"
        else:
@@ -1028,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR0915
 
        pdb.post_mortem()
        return 1
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
datachain/client/fsspec.py
CHANGED
@@ -3,7 +3,6 @@ import functools
 import logging
 import multiprocessing
 import os
-import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm import tqdm
 
-from datachain.cache import DataChainCache, UniqueId
-from datachain.client.fileslice import FileSlice, FileWrapper
+from datachain.cache import DataChainCache
+from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
@@ -187,8 +186,8 @@ class Client(ABC):
    def url(self, path: str, expires: int = 3600, **kwargs) -> str:
        return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
 
-    async def get_current_etag(self, uid: "UniqueId") -> str:
-        info = await self.fs._info(self.get_full_path(uid.path))
+    async def get_current_etag(self, file: "File") -> str:
+        info = await self.fs._info(self.get_full_path(file.path))
        return self.info_to_file(info, "").etag
 
    async def get_size(self, path: str) -> int:
@@ -317,7 +316,7 @@ class Client(ABC):
 
    def instantiate_object(
        self,
-        uid: UniqueId,
+        file: "File",
        dst: str,
        progress_bar: tqdm,
        force: bool = False,
@@ -328,10 +327,10 @@ class Client(ABC):
        else:
            progress_bar.close()
            raise FileExistsError(f"Path {dst} already exists")
-        self.do_instantiate_object(uid, dst)
+        self.do_instantiate_object(file, dst)
 
-    def do_instantiate_object(self, uid: "UniqueId", dst: str) -> None:
-        src = self.cache.get_path(uid)
+    def do_instantiate_object(self, file: "File", dst: str) -> None:
+        src = self.cache.get_path(file)
        assert src is not None
 
        try:
@@ -341,66 +340,33 @@ class Client(ABC):
            copy2(src, dst)
 
    def open_object(
-        self, uid: UniqueId, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
+        self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
    ) -> BinaryIO:
        """Open a file, including files in tar archives."""
-        location = uid.get_parsed_location()
-        if use_cache and (cache_path := self.cache.get_path(uid)):
+        if use_cache and (cache_path := self.cache.get_path(file)):
            return open(cache_path, mode="rb")  # noqa: SIM115
-        if location:
-            return self._open_tar(uid, use_cache=use_cache)
-        return FileWrapper(self.fs.open(self.get_full_path(uid.path)), cb)  # type: ignore[return-value]
-
-    def _open_tar(self, uid: UniqueId, use_cache: bool = True):
-        location = uid.get_parsed_location()
-        assert location
-
-        offset = location["offset"]
-        size = location["size"]
-        parent = location["parent"]
-
-        parent_uid = UniqueId(
-            parent["source"],
-            parent["path"],
-            parent["size"],
-            parent["etag"],
-            location=parent["location"],
-        )
-        f = self.open_object(parent_uid, use_cache=use_cache)
-        return FileSlice(f, offset, size, posixpath.basename(uid.path))
-
-    def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
-        sync(get_loop(), functools.partial(self._download, uid, callback=callback))
-
-    async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
-        if self.cache.contains(uid):
+        assert not file.location
+        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+
+    def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
+        sync(get_loop(), functools.partial(self._download, file, callback=callback))
+
+    async def _download(self, file: File, *, callback: "Callback" = None) -> None:
+        if self.cache.contains(file):
            # Already in cache, so there's nothing to do.
            return
-        await self._put_in_cache(uid, callback=callback)
+        await self._put_in_cache(file, callback=callback)
 
-    def put_in_cache(self, uid: "UniqueId", *, callback: "Callback" = None) -> None:
-        sync(get_loop(), functools.partial(self._put_in_cache, uid, callback=callback))
+    def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
 
-    async def _put_in_cache(
-        self, uid: UniqueId, *, callback: "Callback" = None
-    ) -> None:
-        location = uid.get_parsed_location()
-        if location:
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(
-                None, functools.partial(self._download_from_tar, uid, callback=callback)
-            )
-            return
-        if uid.etag:
-            etag = await self.get_current_etag(uid)
-            if uid.etag != etag:
+    async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        assert not file.location
+        if file.etag:
+            etag = await self.get_current_etag(file)
+            if file.etag != etag:
                raise FileNotFoundError(
-                    f"Invalid etag for {uid.storage}/{uid.path}: "
-                    f"expected {uid.etag}, got {etag}"
+                    f"Invalid etag for {file.source}/{file.path}: "
+                    f"expected {file.etag}, got {etag}"
                )
-        await self.cache.download(uid, self, callback=callback)
-
-    def _download_from_tar(self, uid, *, callback: "Callback" = None):
-        with self._open_tar(uid, use_cache=False) as f:
-            contents = f.read()
-        self.cache.store_data(uid, contents)
+        await self.cache.download(file, self, callback=callback)
datachain/client/local.py
CHANGED
@@ -7,7 +7,6 @@ from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
 
-from datachain.cache import UniqueId
 from datachain.lib.file import File
 from datachain.storage import StorageURI
 
@@ -114,8 +113,8 @@ class FileClient(Client):
            use_symlinks=use_symlinks,
        )
 
-    async def get_current_etag(self, uid: "UniqueId") -> str:
-        info = self.fs.info(self.get_full_path(uid.path))
+    async def get_current_etag(self, file: "File") -> str:
+        info = self.fs.info(self.get_full_path(file.path))
        return self.info_to_file(info, "").etag
 
    async def get_size(self, path: str) -> int:
datachain/lib/arrow.py
CHANGED
@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):
 
    def process(self, file: File):
        if file._caching_enabled:
-            path = file.get_local_path(download=True)
+            file.ensure_cached()
+            path = file.get_local_path()
            ds = dataset(path, schema=self.input_schema, **self.kwargs)
        elif self.nrows:
            path = _nrows_file(file, self.nrows)
datachain/lib/dc.py
CHANGED
@@ -58,6 +58,7 @@ from datachain.query.dataset import (
 )
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
            **kwargs,
            indexing_column_types=File._datachain_column_types,
        )
+
+        telemetry.send_event_once("class", "datachain_init", **kwargs)
+
        if settings:
            self._settings = Settings(**settings)
        else:
datachain/lib/file.py
CHANGED
@@ -1,11 +1,14 @@
+import hashlib
 import io
 import json
 import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
+from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
@@ -19,7 +22,6 @@ from pydantic import Field, field_validator
 if TYPE_CHECKING:
    from typing_extensions import Self
 
-    from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
@@ -27,7 +29,13 @@ from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
    from datachain.catalog import Catalog
+    from datachain.client.fsspec import Client
+    from datachain.dataset import RowDict
+
+sha256 = partial(hashlib.sha256, usedforsecurity=False)
 
 logger = logging.getLogger("datachain")
 
@@ -38,7 +46,7 @@ ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 class VFileError(DataChainError):
    def __init__(self, file: "File", message: str, vtype: str = ""):
        type_ = f" of vtype '{vtype}'" if vtype else ""
-        super().__init__(f"Error in v-file '{file.get_uid().path}'{type_}: {message}")
+        super().__init__(f"Error in v-file '{file.path}'{type_}: {message}")
 
 
 class FileError(DataChainError):
@@ -85,9 +93,8 @@ class TarVFile(VFile):
        tar_file = File(**parent)
        tar_file._set_stream(file._catalog)
 
-        tar_file_uid = tar_file.get_uid()
-        client = file._catalog.get_client(tar_file.source)
-        fd = client.open_object(tar_file_uid, use_cache=file._caching_enabled)
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
        return FileSlice(fd, offset, size, file.name)
 
 
@@ -181,7 +188,11 @@ class File(DataModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._catalog = None
-        self._caching_enabled = False
+        self._caching_enabled: bool = False
+
+    @classmethod
+    def _from_row(cls, row: "RowDict") -> "Self":
+        return cls(**{key: row[key] for key in cls._datachain_column_types})
 
    @property
    def name(self):
@@ -192,19 +203,18 @@ class File(DataModel):
        return str(PurePosixPath(self.path).parent)
 
    @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "rb"):
+    def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
        """Open the file and return a file object."""
        if self.location:
            with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                yield f
 
        else:
-            uid = self.get_uid()
-            client = self._catalog.get_client(self.source)
            if self._caching_enabled:
-                client.download(uid, callback=self._download_cb)
+                self.ensure_cached()
+            client: Client = self._catalog.get_client(self.source)
            with client.open_object(
-                uid, use_cache=self._caching_enabled, cb=self._download_cb
+                self, use_cache=self._caching_enabled, cb=self._download_cb
            ) as f:
                yield io.TextIOWrapper(f) if mode == "r" else f
 
@@ -252,23 +262,25 @@ class File(DataModel):
        self._caching_enabled = caching_enabled
        self._download_cb = download_cb
 
-    def get_uid(self) -> "UniqueId":
-        """Returns unique ID for file."""
-        dump = self.model_dump()
-        return UniqueId(*(dump[k] for k in self._datachain_column_types))
+    def ensure_cached(self) -> None:
+        if self._catalog is None:
+            raise RuntimeError(
+                "cannot download file to cache because catalog is not setup"
+            )
+        client = self._catalog.get_client(self.source)
+        client.download(self, callback=self._download_cb)
+
+    def get_local_path(self) -> Optional[str]:
+        """Return path to a file in a local cache.
 
-    def get_local_path(self, download: bool = False) -> Optional[str]:
-        """Return path to a file in a local cache.
-        Returns None if file is not cached."""
+        Returns None if file is not cached.
+        Raises an exception if cache is not setup.
+        """
        if self._catalog is None:
            raise RuntimeError(
                "cannot resolve local file path because catalog is not setup"
            )
-        uid = self.get_uid()
-        if download:
-            client = self._catalog.get_client(self.source)
-            client.download(uid, callback=self._download_cb)
-        return self._catalog.cache.get_path(uid)
+        return self._catalog.cache.get_path(self)
 
    def get_file_suffix(self):
        """Returns last part of file name with `.`."""
@@ -323,6 +335,12 @@ class File(DataModel):
        """Returns `fsspec` filesystem for the file."""
        return self._catalog.get_client(self.source).fs
 
+    def get_hash(self) -> str:
+        fingerprint = f"{self.source}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()
+
    def resolve(self) -> "Self":
        """
        Resolve a File object by checking its existence and updating its metadata.
datachain/lib/listing.py
CHANGED
@@ -11,6 +11,7 @@ from datachain.client import Client
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
@@ -80,6 +81,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
    client_config = client_config or {}
    client = Client.get_client(uri, cache, **client_config)
    storage_uri, path = Client.parse_url(uri)
+    telemetry.log_param("client", client.PREFIX)
 
    # clean path without globs
    lst_uri_path = (
datachain/listing.py
CHANGED
@@ -156,12 +156,12 @@ class Listing:
 
    def instantiate_nodes(
        self,
-        all_nodes,
+        all_nodes: Iterable[NodeWithPath],
        output,
        total_files=None,
        force=False,
        shared_progress_bar=None,
-    ):
+    ) -> None:
        progress_bar = shared_progress_bar or tqdm(
            desc=f"Instantiating '{output}'",
            unit=" files",
@@ -175,8 +175,8 @@ class Listing:
            dst = os.path.join(output, *node.path)
            dst_dir = os.path.dirname(dst)
            os.makedirs(dst_dir, exist_ok=True)
-            uid = node.n.as_uid(self.client.uri)
-            self.client.instantiate_object(uid, dst, progress_bar, force)
+            file = node.n.to_file(self.client.uri)
+            self.client.instantiate_object(file, dst, progress_bar, force)
            counter += 1
            if counter > 1000:
                progress_bar.update(counter)
datachain/node.py
CHANGED
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import attrs
 
-from datachain.cache import UniqueId
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -99,11 +99,11 @@ class Node:
            return self.path + "/"
        return self.path
 
-    def as_uid(self, source: Optional[StorageURI] = None) -> UniqueId:
-        if not source:
-            source = self.source
-        return UniqueId(
-            source,
+    def to_file(self, source: Optional[StorageURI] = None) -> File:
+        if source is None:
+            source = self.source
+        return File(
+            source=source,
            path=self.path,
            size=self.size,
            version=self.version or "",
datachain/nodes_fetcher.py
CHANGED
@@ -1,12 +1,19 @@
 import logging
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
 
+from datachain.node import Node
 from datachain.nodes_thread_pool import NodesThreadPool
 
+if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
+    from datachain.client.fsspec import Client
+
 logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client, max_threads, cache):
+    def __init__(self, client: "Client", max_threads: int, cache: "DataChainCache"):
        super().__init__(max_threads)
        self.client = client
        self.cache = cache
@@ -15,7 +22,7 @@ class NodesFetcher(NodesThreadPool):
        for task in done:
            task.result()
 
-    def do_task(self, chunk):
+    def do_task(self, chunk: Iterable[Node]) -> None:
        from fsspec import Callback
 
        class _CB(Callback):
@@ -23,8 +30,8 @@ class NodesFetcher(NodesThreadPool):
                self.increase_counter(inc)
 
        for node in chunk:
-            uid = node.as_uid(self.client.uri)
-            if self.cache.contains(uid):
+            file = node.to_file(self.client.uri)
+            if self.cache.contains(file):
                self.increase_counter(node.size)
            else:
-                self.client.put_in_cache(uid, callback=_CB())
+                self.client.put_in_cache(file, callback=_CB())