datachain-0.3.17-py3-none-any.whl → datachain-0.3.18-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of datachain has been flagged as potentially problematic.

datachain/cache.py CHANGED
@@ -1,56 +1,15 @@
-import hashlib
-import json
 import os
-from datetime import datetime
-from functools import partial
 from typing import TYPE_CHECKING, Optional
 
-import attrs
 from dvc_data.hashfile.db.local import LocalHashFileDB
 from dvc_objects.fs.local import LocalFileSystem
 from fsspec.callbacks import Callback, TqdmCallback
 
-from datachain.utils import TIME_ZERO
-
 from .progress import Tqdm
 
 if TYPE_CHECKING:
     from datachain.client import Client
-    from datachain.storage import StorageURI
-
-sha256 = partial(hashlib.sha256, usedforsecurity=False)
-
-
-@attrs.frozen
-class UniqueId:
-    storage: "StorageURI"
-    path: str
-    size: int
-    etag: str
-    version: str = ""
-    is_latest: bool = True
-    location: Optional[str] = None
-    last_modified: datetime = TIME_ZERO
-
-    def get_parsed_location(self) -> Optional[dict]:
-        if not self.location:
-            return None
-
-        loc_stack = (
-            json.loads(self.location)
-            if isinstance(self.location, str)
-            else self.location
-        )
-        if len(loc_stack) > 1:
-            raise NotImplementedError("Nested v-objects are not supported yet.")
-
-        return loc_stack[0]
-
-    def get_hash(self) -> str:
-        fingerprint = f"{self.storage}/{self.path}/{self.version}/{self.etag}"
-        if self.location:
-            fingerprint += f"/{self.location}"
-        return sha256(fingerprint.encode()).hexdigest()
+    from datachain.lib.file import File
 
 
 def try_scandir(path):
@@ -77,30 +36,30 @@ class DataChainCache:
     def tmp_dir(self):
         return self.odb.tmp_dir
 
-    def get_path(self, uid: UniqueId) -> Optional[str]:
-        if self.contains(uid):
-            return self.path_from_checksum(uid.get_hash())
+    def get_path(self, file: "File") -> Optional[str]:
+        if self.contains(file):
+            return self.path_from_checksum(file.get_hash())
         return None
 
-    def contains(self, uid: UniqueId) -> bool:
-        return self.odb.exists(uid.get_hash())
+    def contains(self, file: "File") -> bool:
+        return self.odb.exists(file.get_hash())
 
     def path_from_checksum(self, checksum: str) -> str:
         assert checksum
         return self.odb.oid_to_path(checksum)
 
-    def remove(self, uid: UniqueId) -> None:
-        self.odb.delete(uid.get_hash())
+    def remove(self, file: "File") -> None:
+        self.odb.delete(file.get_hash())
 
     async def download(
-        self, uid: UniqueId, client: "Client", callback: Optional[Callback] = None
+        self, file: "File", client: "Client", callback: Optional[Callback] = None
     ) -> None:
-        from_path = f"{uid.storage}/{uid.path}"
+        from_path = f"{file.source}/{file.path}"
        from dvc_objects.fs.utils import tmp_fname
 
        odb_fs = self.odb.fs
        tmp_info = odb_fs.join(self.odb.tmp_dir, tmp_fname())  # type: ignore[arg-type]
-        size = uid.size
+        size = file.size
        if size < 0:
            size = await client.get_size(from_path)
        cb = callback or TqdmCallback(
@@ -115,13 +74,13 @@ class DataChainCache:
            cb.close()
 
        try:
-            oid = uid.get_hash()
+            oid = file.get_hash()
            self.odb.add(tmp_info, self.odb.fs, oid)
        finally:
            os.unlink(tmp_info)
 
-    def store_data(self, uid: UniqueId, contents: bytes) -> None:
-        checksum = uid.get_hash()
+    def store_data(self, file: "File", contents: bytes) -> None:
+        checksum = file.get_hash()
        dst = self.path_from_checksum(checksum)
        if not os.path.exists(dst):
            # Create the file only if it's not already in cache
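With `UniqueId` gone, cache entries are keyed by `File.get_hash()` (added to `datachain/lib/file.py` further down). A minimal sketch of the new call pattern; the field values and the `DataChainCache(...)` constructor arguments below are illustrative assumptions, not taken from the diff:

```python
from datachain.cache import DataChainCache
from datachain.lib.file import File

# Illustrative values; any real source/path/etag combination works the same way.
file = File(source="s3://bucket", path="images/cat.jpg", size=1024, etag="abc123")

cache = DataChainCache("/tmp/datachain/cache", "/tmp/datachain/tmp")  # cache dir, tmp dir (assumed signature)
if cache.contains(file):           # lookup is keyed by file.get_hash()
    print(cache.get_path(file))    # local path inside the object store, or None if missing
```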
datachain/catalog/catalog.py CHANGED
@@ -34,7 +34,7 @@ import yaml
 from sqlalchemy import Column
 from tqdm import tqdm
 
-from datachain.cache import DataChainCache, UniqueId
+from datachain.cache import DataChainCache
 from datachain.client import Client
 from datachain.config import get_remote_config, read_config
 from datachain.dataset import (
@@ -619,13 +619,13 @@ class Catalog:
         code_ast.body[-1:] = new_expressions
         return code_ast
 
-    def get_client(self, uri: StorageURI, **config: Any) -> Client:
+    def get_client(self, uri: str, **config: Any) -> Client:
         """
         Return the client corresponding to the given source `uri`.
         """
         config = config or self.client_config
         cls = Client.get_implementation(uri)
-        return cls.from_source(uri, self.cache, **config)
+        return cls.from_source(StorageURI(uri), self.cache, **config)
 
     def enlist_source(
         self,
@@ -1431,7 +1431,7 @@ class Catalog:
 
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
-    ) -> Optional[dict]:
+    ) -> Optional[RowDict]:
         """
         Function that returns file signals from dataset row.
         Note that signal names are without prefix, so if there was 'laion__file__source'
@@ -1448,7 +1448,7 @@ class Catalog:
 
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
-        file_signals_values = {}
+        file_signals_values = RowDict()
 
         schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
@@ -1476,6 +1476,8 @@ class Catalog:
         use_cache: bool = True,
         **config: Any,
     ):
+        from datachain.lib.file import File
+
         file_signals = self.get_file_signals(dataset_name, dataset_version, row)
         if not file_signals:
             raise RuntimeError("Cannot open object without file signals")
@@ -1483,22 +1485,10 @@
         config = config or self.client_config
         client = self.get_client(file_signals["source"], **config)
         return client.open_object(
-            self._get_row_uid(file_signals),  # type: ignore [arg-type]
+            File._from_row(file_signals),
             use_cache=use_cache,
         )
 
-    def _get_row_uid(self, row: RowDict) -> UniqueId:
-        return UniqueId(
-            row["source"],
-            row["path"],
-            row["size"],
-            row["etag"],
-            row["version"],
-            row["is_latest"],
-            row["location"],
-            row["last_modified"],
-        )
-
     def ls(
         self,
         sources: list[str],
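`Catalog._get_row_uid` is replaced by the new `File._from_row` classmethod (added in `datachain/lib/file.py` below), which builds a `File` from the row's file-signal columns. A rough sketch of the equivalence; the exact key set of `File._datachain_column_types` and the values are assumptions for illustration:

```python
from datachain.lib.file import File
from datachain.utils import TIME_ZERO

# Hypothetical file-signal values, as returned by Catalog.get_file_signals().
row = {
    "source": "s3://bucket",
    "path": "images/cat.jpg",
    "size": 1024,
    "version": "",
    "etag": "abc123",
    "is_latest": True,
    "last_modified": TIME_ZERO,
    "location": None,
}

# 0.3.17: uid = catalog._get_row_uid(row)
# 0.3.18: a single File object serves both roles.
file = File._from_row(row)
```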
datachain/cli.py CHANGED
@@ -15,6 +15,7 @@ import shtab
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
 from datachain.lib.dc import DataChain
+from datachain.telemetry import telemetry
 from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
@@ -872,6 +873,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
        # This also sets this environment variable for any subprocesses
        os.environ["DEBUG_SHOW_SQL_QUERIES"] = "True"
 
+    error = None
    try:
        catalog = get_catalog(client_config=client_config)
        if args.command == "cp":
@@ -1003,14 +1005,16 @@
            print(f"invalid command: {args.command}", file=sys.stderr)
            return 1
        return 0
-    except BrokenPipeError:
+    except BrokenPipeError as exc:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        # See: https://docs.python.org/3/library/signal.html#note-on-sigpipe
+        error = str(exc)
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        return 141  # 128 + 13 (SIGPIPE)
    except (KeyboardInterrupt, Exception) as exc:
+        error = str(exc)
        if isinstance(exc, KeyboardInterrupt):
            msg = "Operation cancelled by the user"
        else:
@@ -1028,3 +1032,5 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
 
            pdb.post_mortem()
        return 1
+    finally:
+        telemetry.send_cli_call(args.command, error=error)
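The CLI now reports every invocation through `telemetry.send_cli_call`, recording the error message (if any) on the way out. A stripped-down sketch of the pattern introduced above; `run_command` is a placeholder for the real command dispatch, not an actual datachain function:

```python
from datachain.telemetry import telemetry

def run_command(command: str) -> None:
    """Placeholder for the real cp/ls/query/... dispatch."""

def main(command: str) -> int:
    error = None
    try:
        run_command(command)
        return 0
    except Exception as exc:
        error = str(exc)
        return 1
    finally:
        # Sent on both success and failure; error stays None on success.
        telemetry.send_cli_call(command, error=error)
```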
datachain/client/fsspec.py CHANGED
@@ -3,7 +3,6 @@ import functools
 import logging
 import multiprocessing
 import os
-import posixpath
 import re
 import sys
 from abc import ABC, abstractmethod
@@ -26,8 +25,8 @@ from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from tqdm import tqdm
 
-from datachain.cache import DataChainCache, UniqueId
-from datachain.client.fileslice import FileSlice, FileWrapper
+from datachain.cache import DataChainCache
+from datachain.client.fileslice import FileWrapper
 from datachain.error import ClientError as DataChainClientError
 from datachain.lib.file import File
 from datachain.nodes_fetcher import NodesFetcher
@@ -187,8 +186,8 @@ class Client(ABC):
     def url(self, path: str, expires: int = 3600, **kwargs) -> str:
         return self.fs.sign(self.get_full_path(path), expiration=expires, **kwargs)
 
-    async def get_current_etag(self, uid: UniqueId) -> str:
-        info = await self.fs._info(self.get_full_path(uid.path))
+    async def get_current_etag(self, file: "File") -> str:
+        info = await self.fs._info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str) -> int:
@@ -317,7 +316,7 @@ class Client(ABC):
 
     def instantiate_object(
         self,
-        uid: UniqueId,
+        file: "File",
         dst: str,
         progress_bar: tqdm,
         force: bool = False,
@@ -328,10 +327,10 @@
         else:
             progress_bar.close()
             raise FileExistsError(f"Path {dst} already exists")
-        self.do_instantiate_object(uid, dst)
+        self.do_instantiate_object(file, dst)
 
-    def do_instantiate_object(self, uid: "UniqueId", dst: str) -> None:
-        src = self.cache.get_path(uid)
+    def do_instantiate_object(self, file: "File", dst: str) -> None:
+        src = self.cache.get_path(file)
         assert src is not None
 
         try:
@@ -341,66 +340,33 @@
             copy2(src, dst)
 
     def open_object(
-        self, uid: UniqueId, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
+        self, file: File, use_cache: bool = True, cb: Callback = DEFAULT_CALLBACK
     ) -> BinaryIO:
         """Open a file, including files in tar archives."""
-        location = uid.get_parsed_location()
-        if use_cache and (cache_path := self.cache.get_path(uid)):
+        if use_cache and (cache_path := self.cache.get_path(file)):
             return open(cache_path, mode="rb")  # noqa: SIM115
-        if location and location["vtype"] == "tar":
-            return self._open_tar(uid, use_cache=True)
-        return FileWrapper(self.fs.open(self.get_full_path(uid.path)), cb)  # type: ignore[return-value]
-
-    def _open_tar(self, uid: UniqueId, use_cache: bool = True):
-        location = uid.get_parsed_location()
-        assert location
-
-        offset = location["offset"]
-        size = location["size"]
-        parent = location["parent"]
-
-        parent_uid = UniqueId(
-            parent["source"],
-            parent["path"],
-            parent["size"],
-            parent["etag"],
-            location=parent["location"],
-        )
-        f = self.open_object(parent_uid, use_cache=use_cache)
-        return FileSlice(f, offset, size, posixpath.basename(uid.path))
-
-    def download(self, uid: UniqueId, *, callback: Callback = DEFAULT_CALLBACK) -> None:
-        sync(get_loop(), functools.partial(self._download, uid, callback=callback))
-
-    async def _download(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
-        if self.cache.contains(uid):
+        assert not file.location
+        return FileWrapper(self.fs.open(self.get_full_path(file.path)), cb)  # type: ignore[return-value]
+
+    def download(self, file: File, *, callback: Callback = DEFAULT_CALLBACK) -> None:
+        sync(get_loop(), functools.partial(self._download, file, callback=callback))
+
+    async def _download(self, file: File, *, callback: "Callback" = None) -> None:
+        if self.cache.contains(file):
             # Already in cache, so there's nothing to do.
             return
-        await self._put_in_cache(uid, callback=callback)
+        await self._put_in_cache(file, callback=callback)
 
-    def put_in_cache(self, uid: UniqueId, *, callback: "Callback" = None) -> None:
-        sync(get_loop(), functools.partial(self._put_in_cache, uid, callback=callback))
+    def put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        sync(get_loop(), functools.partial(self._put_in_cache, file, callback=callback))
 
-    async def _put_in_cache(
-        self, uid: UniqueId, *, callback: "Callback" = None
-    ) -> None:
-        location = uid.get_parsed_location()
-        if location and location["vtype"] == "tar":
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(
-                None, functools.partial(self._download_from_tar, uid, callback=callback)
-            )
-            return
-        if uid.etag:
-            etag = await self.get_current_etag(uid)
-            if uid.etag != etag:
+    async def _put_in_cache(self, file: File, *, callback: "Callback" = None) -> None:
+        assert not file.location
+        if file.etag:
+            etag = await self.get_current_etag(file)
+            if file.etag != etag:
                 raise FileNotFoundError(
-                    f"Invalid etag for {uid.storage}/{uid.path}: "
-                    f"expected {uid.etag}, got {etag}"
+                    f"Invalid etag for {file.source}/{file.path}: "
+                    f"expected {file.etag}, got {etag}"
                 )
-        await self.cache.download(uid, self, callback=callback)
-
-    def _download_from_tar(self, uid, *, callback: "Callback" = None):
-        with self._open_tar(uid, use_cache=False) as f:
-            contents = f.read()
-        self.cache.store_data(uid, contents)
+        await self.cache.download(file, self, callback=callback)
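Taken together, the client now works directly with `File` objects, and the tar/v-object special cases move out of `Client` (hence the new `assert not file.location`). A hedged sketch of the resulting call flow; `catalog` and `file` are assumed to already exist, and the values are illustrative:

```python
# `file` is a datachain.lib.file.File with file.location unset (a plain object, not inside a tar).
client = catalog.get_client(file.source)   # Catalog.get_client now accepts a plain str URI

client.download(file)                      # fills the local cache, keyed by file.get_hash()
with client.open_object(file, use_cache=True) as f:
    header = f.read(16)                    # served from the cache when present
```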
datachain/client/local.py CHANGED
@@ -7,7 +7,6 @@ from urllib.parse import urlparse
 
 from fsspec.implementations.local import LocalFileSystem
 
-from datachain.cache import UniqueId
 from datachain.lib.file import File
 from datachain.storage import StorageURI
 
@@ -114,8 +113,8 @@ class FileClient(Client):
             use_symlinks=use_symlinks,
         )
 
-    async def get_current_etag(self, uid: UniqueId) -> str:
-        info = self.fs.info(self.get_full_path(uid.path))
+    async def get_current_etag(self, file: "File") -> str:
+        info = self.fs.info(self.get_full_path(file.path))
         return self.info_to_file(info, "").etag
 
     async def get_size(self, path: str) -> int:
datachain/lib/arrow.py CHANGED
@@ -49,7 +49,8 @@ class ArrowGenerator(Generator):
 
     def process(self, file: File):
         if file._caching_enabled:
-            path = file.get_local_path(download=True)
+            file.ensure_cached()
+            path = file.get_local_path()
             ds = dataset(path, schema=self.input_schema, **self.kwargs)
         elif self.nrows:
             path = _nrows_file(file, self.nrows)
datachain/lib/dc.py CHANGED
@@ -58,6 +58,7 @@ from datachain.query.dataset import (
 )
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
+
+        telemetry.send_event_once("class", "datachain_init", **kwargs)
+
         if settings:
             self._settings = Settings(**settings)
         else:
datachain/lib/file.py CHANGED
@@ -1,11 +1,14 @@
+import hashlib
 import io
 import json
 import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
+from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
@@ -19,7 +22,6 @@ from pydantic import Field, field_validator
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
@@ -27,7 +29,13 @@ from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from datachain.catalog import Catalog
+    from datachain.client.fsspec import Client
+    from datachain.dataset import RowDict
+
+sha256 = partial(hashlib.sha256, usedforsecurity=False)
 
 logger = logging.getLogger("datachain")
 
@@ -38,7 +46,7 @@ ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
         type_ = f" of vtype '{vtype}'" if vtype else ""
-        super().__init__(f"Error in v-file '{file.get_uid().path}'{type_}: {message}")
+        super().__init__(f"Error in v-file '{file.path}'{type_}: {message}")
 
 
 class FileError(DataChainError):
@@ -85,9 +93,8 @@ class TarVFile(VFile):
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
 
-        tar_file_uid = tar_file.get_uid()
-        client = file._catalog.get_client(tar_file_uid.storage)
-        fd = client.open_object(tar_file_uid, use_cache=file._caching_enabled)
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
         return FileSlice(fd, offset, size, file.name)
 
 
@@ -181,7 +188,11 @@ class File(DataModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._catalog = None
-        self._caching_enabled = False
+        self._caching_enabled: bool = False
+
+    @classmethod
+    def _from_row(cls, row: "RowDict") -> "Self":
+        return cls(**{key: row[key] for key in cls._datachain_column_types})
 
     @property
     def name(self):
@@ -192,19 +203,18 @@ class File(DataModel):
         return str(PurePosixPath(self.path).parent)
 
     @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "rb"):
+    def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
         else:
-            uid = self.get_uid()
-            client = self._catalog.get_client(self.source)
             if self._caching_enabled:
-                client.download(uid, callback=self._download_cb)
+                self.ensure_cached()
+            client: Client = self._catalog.get_client(self.source)
             with client.open_object(
-                uid, use_cache=self._caching_enabled, cb=self._download_cb
+                self, use_cache=self._caching_enabled, cb=self._download_cb
             ) as f:
                 yield io.TextIOWrapper(f) if mode == "r" else f
 
@@ -252,23 +262,25 @@ class File(DataModel):
         self._caching_enabled = caching_enabled
         self._download_cb = download_cb
 
-    def get_uid(self) -> UniqueId:
-        """Returns unique ID for file."""
-        dump = self.model_dump()
-        return UniqueId(*(dump[k] for k in self._unique_id_keys))
+    def ensure_cached(self) -> None:
+        if self._catalog is None:
+            raise RuntimeError(
+                "cannot download file to cache because catalog is not setup"
+            )
+        client = self._catalog.get_client(self.source)
+        client.download(self, callback=self._download_cb)
+
+    def get_local_path(self) -> Optional[str]:
+        """Return path to a file in a local cache.
 
-    def get_local_path(self, download: bool = False) -> Optional[str]:
-        """Returns path to a file in a local cache.
-        Return None if file is not cached. Throws an exception if cache is not setup."""
+        Returns None if file is not cached.
+        Raises an exception if cache is not setup.
+        """
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        uid = self.get_uid()
-        if download:
-            client = self._catalog.get_client(self.source)
-            client.download(uid, callback=self._download_cb)
-        return self._catalog.cache.get_path(uid)
+        return self._catalog.cache.get_path(self)
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -323,6 +335,12 @@ class File(DataModel):
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs
 
+    def get_hash(self) -> str:
+        fingerprint = f"{self.source}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()
+
     def resolve(self) -> "Self":
         """
         Resolve a File object by checking its existence and updating its metadata.
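`get_local_path(download=True)` is gone; downloading into the cache is now an explicit `ensure_cached()` step, which is exactly what the `arrow.py` change above does. Before/after sketch, assuming `file` is an existing `File` with a catalog attached:

```python
# 0.3.17
path = file.get_local_path(download=True)

# 0.3.18
file.ensure_cached()           # downloads into the DataChain cache if needed
path = file.get_local_path()   # still returns None if the file is not cached
```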
datachain/lib/listing.py CHANGED
@@ -11,6 +11,7 @@ from datachain.client import Client
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
@@ -80,6 +81,7 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     client_config = client_config or {}
     client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
+    telemetry.log_param("client", client.PREFIX)
 
     # clean path without globs
     lst_uri_path = (
datachain/listing.py CHANGED
@@ -156,12 +156,12 @@ class Listing:
 
     def instantiate_nodes(
         self,
-        all_nodes,
+        all_nodes: Iterable[NodeWithPath],
         output,
         total_files=None,
         force=False,
         shared_progress_bar=None,
-    ):
+    ) -> None:
         progress_bar = shared_progress_bar or tqdm(
             desc=f"Instantiating '{output}'",
             unit=" files",
@@ -175,8 +175,8 @@ class Listing:
             dst = os.path.join(output, *node.path)
             dst_dir = os.path.dirname(dst)
             os.makedirs(dst_dir, exist_ok=True)
-            uid = node.n.as_uid(self.client.uri)
-            self.client.instantiate_object(uid, dst, progress_bar, force)
+            file = node.n.to_file(self.client.uri)
+            self.client.instantiate_object(file, dst, progress_bar, force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
datachain/node.py CHANGED
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import attrs
 
-from datachain.cache import UniqueId
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -99,11 +99,11 @@ class Node:
             return self.path + "/"
         return self.path
 
-    def as_uid(self, storage: Optional[StorageURI] = None) -> UniqueId:
-        if storage is None:
-            storage = self.source
-        return UniqueId(
-            storage=storage,
+    def to_file(self, source: Optional[StorageURI] = None) -> File:
+        if source is None:
+            source = self.source
+        return File(
+            source=source,
             path=self.path,
             size=self.size,
             version=self.version or "",
datachain/nodes_fetcher.py CHANGED
@@ -1,12 +1,19 @@
 import logging
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
 
+from datachain.node import Node
 from datachain.nodes_thread_pool import NodesThreadPool
 
+if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
+    from datachain.client.fsspec import Client
+
 logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client, max_threads, cache):
+    def __init__(self, client: "Client", max_threads: int, cache: "DataChainCache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache
@@ -15,7 +22,7 @@ class NodesFetcher(NodesThreadPool):
         for task in done:
             task.result()
 
-    def do_task(self, chunk):
+    def do_task(self, chunk: Iterable[Node]) -> None:
         from fsspec import Callback
 
         class _CB(Callback):
@@ -23,8 +30,8 @@ class NodesFetcher(NodesThreadPool):
                 self.increase_counter(inc)
 
         for node in chunk:
-            uid = node.as_uid(self.client.uri)
-            if self.cache.contains(uid):
+            file = node.to_file(self.client.uri)
+            if self.cache.contains(file):
                 self.increase_counter(node.size)
             else:
-                self.client.put_in_cache(uid, callback=_CB())
+                self.client.put_in_cache(file, callback=_CB())