datachain 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain has been flagged as possibly problematic.
datachain/lib/dc.py CHANGED

@@ -58,6 +58,7 @@ from datachain.query.dataset import (
 )
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
+
+        telemetry.send_event_once("class", "datachain_init", **kwargs)
+
         if settings:
             self._settings = Settings(**settings)
         else:
@@ -1337,8 +1341,7 @@ class DataChain(DatasetQuery):
                 other.signals_schema.resolve(*right_on).db_signals(),
             )  # type: ignore[arg-type]
         )
-
-        return super()._subtract(other, signals)  # type: ignore[arg-type]
+        return super().subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
datachain/lib/file.py CHANGED

@@ -1,11 +1,14 @@
+import hashlib
 import io
 import json
 import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
+from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
@@ -19,7 +22,6 @@ from pydantic import Field, field_validator
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
@@ -27,7 +29,13 @@ from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from datachain.catalog import Catalog
+    from datachain.client.fsspec import Client
+    from datachain.dataset import RowDict
+
+sha256 = partial(hashlib.sha256, usedforsecurity=False)
 
 logger = logging.getLogger("datachain")
 
@@ -38,7 +46,7 @@ ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
         type_ = f" of vtype '{vtype}'" if vtype else ""
-        super().__init__(f"Error in v-file '{file.get_uid().path}'{type_}: {message}")
+        super().__init__(f"Error in v-file '{file.path}'{type_}: {message}")
 
 
 class FileError(DataChainError):
@@ -85,9 +93,8 @@ class TarVFile(VFile):
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
 
-        tar_file_uid = tar_file.get_uid()
-        client = file._catalog.get_client(tar_file_uid.storage)
-        fd = client.open_object(tar_file_uid, use_cache=file._caching_enabled)
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
         return FileSlice(fd, offset, size, file.name)
 
 
@@ -181,7 +188,11 @@ class File(DataModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._catalog = None
-        self._caching_enabled = False
+        self._caching_enabled: bool = False
+
+    @classmethod
+    def _from_row(cls, row: "RowDict") -> "Self":
+        return cls(**{key: row[key] for key in cls._datachain_column_types})
 
     @property
     def name(self):
@@ -192,19 +203,18 @@ class File(DataModel):
         return str(PurePosixPath(self.path).parent)
 
     @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "rb"):
+    def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
         else:
-            uid = self.get_uid()
-            client = self._catalog.get_client(self.source)
             if self._caching_enabled:
-                client.download(uid, callback=self._download_cb)
+                self.ensure_cached()
+            client: Client = self._catalog.get_client(self.source)
             with client.open_object(
-                uid, use_cache=self._caching_enabled, cb=self._download_cb
+                self, use_cache=self._caching_enabled, cb=self._download_cb
             ) as f:
                 yield io.TextIOWrapper(f) if mode == "r" else f
 
@@ -252,23 +262,25 @@ class File(DataModel):
         self._caching_enabled = caching_enabled
         self._download_cb = download_cb
 
-    def get_uid(self) -> UniqueId:
-        """Returns unique ID for file."""
-        dump = self.model_dump()
-        return UniqueId(*(dump[k] for k in self._unique_id_keys))
+    def ensure_cached(self) -> None:
+        if self._catalog is None:
+            raise RuntimeError(
+                "cannot download file to cache because catalog is not setup"
+            )
+        client = self._catalog.get_client(self.source)
+        client.download(self, callback=self._download_cb)
+
+    def get_local_path(self) -> Optional[str]:
+        """Return path to a file in a local cache.
 
-    def get_local_path(self, download: bool = False) -> Optional[str]:
-        """Returns path to a file in a local cache.
-        Return None if file is not cached. Throws an exception if cache is not setup."""
+        Returns None if file is not cached.
+        Raises an exception if cache is not setup.
+        """
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        uid = self.get_uid()
-        if download:
-            client = self._catalog.get_client(self.source)
-            client.download(uid, callback=self._download_cb)
-        return self._catalog.cache.get_path(uid)
+        return self._catalog.cache.get_path(self)
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -323,6 +335,12 @@ class File(DataModel):
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs
 
+    def get_hash(self) -> str:
+        fingerprint = f"{self.source}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()
+
     def resolve(self) -> "Self":
         """
         Resolve a File object by checking its existence and updating its metadata.
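
The UniqueId indirection is gone end to end: cache and client methods now take the File object itself, get_local_path() no longer downloads, and the new ensure_cached() does that step explicitly. A hedged usage sketch based only on the signatures visible above, assuming a File whose catalog is already attached:

def local_copy(file) -> str:
    # replaces the removed file.get_local_path(download=True) shortcut
    file.ensure_cached()          # download into the local cache if needed
    path = file.get_local_path()  # cached path, or None if still absent
    if path is None:
        raise RuntimeError(f"{file.path} is not in the cache after download")
    return path

Note also that get_hash() fingerprints a file as source/path/version/etag (plus location when set), so two rows pointing at the same object version produce the same digest.
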
datachain/lib/listing.py CHANGED

@@ -11,6 +11,7 @@ from datachain.client import Client
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
@@ -77,8 +78,10 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
+    client_config = client_config or {}
     client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
+    telemetry.log_param("client", client.PREFIX)
 
     # clean path without globs
     lst_uri_path = (
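
The added guard makes client_config=None safe to pass: **None would raise a TypeError, while the "or {}" fallback normalizes it to an empty mapping first. The idiom in isolation, with a hypothetical stand-in for Client.get_client:

from typing import Optional

def connect(uri: str, client_config: Optional[dict] = None) -> dict:
    client_config = client_config or {}  # None (and {}) become an empty dict
    return {"uri": uri, **client_config}

assert connect("s3://bucket") == {"uri": "s3://bucket"}  # no TypeError from **None

The telemetry.log_param call records the client's scheme prefix (client.PREFIX), not the full URI.
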
datachain/lib/tar.py CHANGED

@@ -30,4 +30,5 @@ def process_tar(file: File) -> Iterator[File]:
     with file.open() as fd:
         with tarfile.open(fileobj=fd) as tar:
             for entry in tar.getmembers():
-                yield build_tar_member(file, entry)
+                if entry.isfile():
+                    yield build_tar_member(file, entry)
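
tar.getmembers() also returns directories, symlinks, and other special entries, so process_tar previously yielded members with no file payload. A standalone illustration with the stdlib (the archive path is a placeholder):

import tarfile

with tarfile.open("archive.tar") as tar:          # any local tar archive
    members = tar.getmembers()
    regular = [m for m in members if m.isfile()]  # what process_tar now yields
    print(f"skipping {len(members) - len(regular)} non-file entries")
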
datachain/listing.py CHANGED

@@ -156,12 +156,12 @@ class Listing:
 
     def instantiate_nodes(
         self,
-        all_nodes,
+        all_nodes: Iterable[NodeWithPath],
         output,
         total_files=None,
        force=False,
        shared_progress_bar=None,
-    ):
+    ) -> None:
         progress_bar = shared_progress_bar or tqdm(
             desc=f"Instantiating '{output}'",
             unit=" files",
@@ -175,8 +175,8 @@ class Listing:
             dst = os.path.join(output, *node.path)
             dst_dir = os.path.dirname(dst)
             os.makedirs(dst_dir, exist_ok=True)
-            uid = node.n.as_uid(self.client.uri)
-            self.client.instantiate_object(uid, dst, progress_bar, force)
+            file = node.n.to_file(self.client.uri)
+            self.client.instantiate_object(file, dst, progress_bar, force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
datachain/node.py CHANGED

@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import attrs
 
-from datachain.cache import UniqueId
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -99,11 +99,11 @@ class Node:
             return self.path + "/"
         return self.path
 
-    def as_uid(self, storage: Optional[StorageURI] = None) -> UniqueId:
-        if storage is None:
-            storage = self.source
-        return UniqueId(
-            storage=storage,
+    def to_file(self, source: Optional[StorageURI] = None) -> File:
+        if source is None:
+            source = self.source
+        return File(
+            source=source,
             path=self.path,
             size=self.size,
             version=self.version or "",
@@ -114,9 +114,23 @@ class Node:
         )
 
     @classmethod
-    def from_dict(cls, d: dict[str, Any]) -> "Self":
-        kw = {f.name: d[f.name] for f in attrs.fields(cls) if f.name in d}
-        return cls(**kw)
+    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+        def _dval(field_name: str):
+            return d.get(f"{file_prefix}__{field_name}")
+
+        return cls(
+            sys__id=d["sys__id"],
+            sys__rand=d["sys__rand"],
+            source=_dval("source"),
+            path=_dval("path"),
+            etag=_dval("etag"),
+            is_latest=_dval("is_latest"),
+            size=_dval("size"),
+            last_modified=_dval("last_modified"),
+            version=_dval("version"),
+            location=_dval("location"),
+            dir_type=DirType.FILE,
+        )
 
     @classmethod
     def from_dir(cls, path, **kwargs) -> "Node":
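
Node.from_dict no longer introspects attrs.fields; it expects flat, prefix-qualified column names (default prefix "file") and pins dir_type to DirType.FILE. The key-mapping helper in isolation, mirroring _dval above:

def _dval(d: dict, field_name: str, file_prefix: str = "file"):
    return d.get(f"{file_prefix}__{field_name}")

row = {"sys__id": 1, "file__path": "dir/a.txt", "file__size": 10}
assert _dval(row, "path") == "dir/a.txt"
assert _dval(row, "etag") is None  # absent columns fall back to None

sys__id and sys__rand remain hard d[...] lookups, so those two columns are still required.
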
datachain/nodes_fetcher.py CHANGED

@@ -1,12 +1,19 @@
 import logging
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
 
+from datachain.node import Node
 from datachain.nodes_thread_pool import NodesThreadPool
 
+if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
+    from datachain.client.fsspec import Client
+
 logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client, max_threads, cache):
+    def __init__(self, client: "Client", max_threads: int, cache: "DataChainCache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache
@@ -15,7 +22,7 @@ class NodesFetcher(NodesThreadPool):
         for task in done:
             task.result()
 
-    def do_task(self, chunk):
+    def do_task(self, chunk: Iterable[Node]) -> None:
         from fsspec import Callback
 
         class _CB(Callback):
@@ -23,8 +30,8 @@ class NodesFetcher(NodesThreadPool):
                 self.increase_counter(inc)
 
         for node in chunk:
-            uid = node.as_uid(self.client.uri)
-            if self.cache.contains(uid):
+            file = node.to_file(self.client.uri)
+            if self.cache.contains(file):
                 self.increase_counter(node.size)
             else:
-                self.client.put_in_cache(uid, callback=_CB())
+                self.client.put_in_cache(file, callback=_CB())
@@ -20,7 +20,7 @@ class NodeChunk:
     def next_downloadable(self):
         node = next(self.nodes, None)
         while node and (
-            not node.is_downloadable or self.cache.contains(node.as_uid(self.storage))
+            not node.is_downloadable or self.cache.contains(node.to_file(self.storage))
         ):
             node = next(self.nodes, None)
         return node
datachain/progress.py CHANGED

@@ -1,8 +1,6 @@
 """Manages progress bars."""
 
 import logging
-import os
-import re
 import sys
 from threading import RLock
 from typing import Any, ClassVar
@@ -10,20 +8,12 @@ from typing import Any, ClassVar
 from fsspec.callbacks import TqdmCallback
 from tqdm import tqdm
 
+from datachain.utils import env2bool
+
 logger = logging.getLogger(__name__)
 tqdm.set_lock(RLock())
 
 
-def env2bool(var, undefined=False):
-    """
-    undefined: return value if env var is unset
-    """
-    var = os.getenv(var, None)
-    if var is None:
-        return undefined
-    return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
-
-
 class Tqdm(tqdm):
     """
     maximum-compatibility tqdm-based progressbars
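
env2bool is not deleted, just relocated: the new import pulls it from datachain.utils. Its behavior, as defined by the copy removed here:

import os
import re

def env2bool(var, undefined=False):
    """Return `undefined` when the env var is unset, else match truthy strings."""
    val = os.getenv(var, None)
    if val is None:
        return undefined
    return bool(re.search("1|y|yes|true", val, flags=re.IGNORECASE))

os.environ["MY_FLAG"] = "Yes"             # hypothetical variable name
assert env2bool("MY_FLAG") is True        # "y" matches case-insensitively
assert env2bool("NO_SUCH_FLAG") is False  # unset -> the `undefined` default
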
datachain/query/__init__.py CHANGED

@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
 from .params import param
 from .schema import C, DatasetRow, LocalFilename, Object, Stream
 from .session import Session
-from .udf import udf
 
 __all__ = [
     "C",
@@ -13,5 +12,4 @@ __all__ = [
     "Session",
     "Stream",
     "param",
-    "udf",
 ]
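
With udf dropped from both the imports and __all__, "from datachain.query import udf" now raises ImportError. A defensive pattern for code that must run against both versions (whether the .udf module itself still exists is not shown in this diff):

try:
    from datachain.query import udf  # re-exported up to 0.3.16
except ImportError:
    udf = None  # dropped from the package namespace in 0.3.18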