datachain 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +21 -55
- datachain/cli.py +7 -26
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/data_storage/metastore.py +7 -66
- datachain/data_storage/sqlite.py +5 -2
- datachain/data_storage/warehouse.py +0 -22
- datachain/lib/arrow.py +2 -1
- datachain/lib/dc.py +5 -2
- datachain/lib/file.py +41 -23
- datachain/lib/listing.py +3 -0
- datachain/lib/tar.py +2 -1
- datachain/listing.py +4 -4
- datachain/node.py +23 -9
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/__init__.py +0 -2
- datachain/query/dataset.py +26 -144
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +36 -24
- datachain/query/udf.py +2 -148
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -40
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/METADATA +5 -3
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/RECORD +32 -32
- datachain/query/builtins.py +0 -96
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/LICENSE +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/WHEEL +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -58,6 +58,7 @@ from datachain.query.dataset import (
 )
 from datachain.query.schema import DEFAULT_DELIMITER, Column, DatasetRow
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
@@ -246,6 +247,9 @@ class DataChain(DatasetQuery):
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
+
+        telemetry.send_event_once("class", "datachain_init", **kwargs)
+
         if settings:
             self._settings = Settings(**settings)
         else:
@@ -1337,8 +1341,7 @@ class DataChain(DatasetQuery):
                 other.signals_schema.resolve(*right_on).db_signals(),
             )  # type: ignore[arg-type]
         )
-
-        return super()._subtract(other, signals)  # type: ignore[arg-type]
+        return super().subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
     def from_values(
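
The new datachain/telemetry.py module (+37 lines in the file list above) is not shown in this diff, so the sketch below is only an assumption about the "send once" behaviour its call sites imply: an event is reported the first time it fires and ignored afterwards. Names and internals here are illustrative, not the module's actual code.

    class _TelemetrySketch:
        """Hypothetical stand-in for datachain.telemetry.telemetry."""

        def __init__(self) -> None:
            self._sent: set[tuple[str, str]] = set()
            self.params: dict[str, object] = {}

        def send_event_once(self, group: str, action: str, **kwargs) -> None:
            key = (group, action)
            if key in self._sent:
                return  # already reported once in this process
            self._sent.add(key)
            print(f"event: {group}/{action} {kwargs}")

        def log_param(self, name: str, value: object) -> None:
            self.params[name] = value

    telemetry = _TelemetrySketch()
    telemetry.send_event_once("class", "datachain_init", settings=None)
    telemetry.send_event_once("class", "datachain_init", settings=None)  # ignored
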
datachain/lib/file.py
CHANGED
@@ -1,11 +1,14 @@
+import hashlib
 import io
 import json
 import logging
 import os
 import posixpath
 from abc import ABC, abstractmethod
+from collections.abc import Iterator
 from contextlib import contextmanager
 from datetime import datetime
+from functools import partial
 from io import BytesIO
 from pathlib import Path, PurePosixPath
 from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
@@ -19,7 +22,6 @@ from pydantic import Field, field_validator
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-from datachain.cache import UniqueId
 from datachain.client.fileslice import FileSlice
 from datachain.lib.data_model import DataModel
 from datachain.lib.utils import DataChainError
@@ -27,7 +29,13 @@ from datachain.sql.types import JSON, Boolean, DateTime, Int, String
 from datachain.utils import TIME_ZERO
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from datachain.catalog import Catalog
+    from datachain.client.fsspec import Client
+    from datachain.dataset import RowDict
+
+sha256 = partial(hashlib.sha256, usedforsecurity=False)
 
 logger = logging.getLogger("datachain")
 
@@ -38,7 +46,7 @@ ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
         type_ = f" of vtype '{vtype}'" if vtype else ""
-        super().__init__(f"Error in v-file '{file.…
+        super().__init__(f"Error in v-file '{file.path}'{type_}: {message}")
 
 
 class FileError(DataChainError):
@@ -85,9 +93,8 @@ class TarVFile(VFile):
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
 
-        …
-        …
-        fd = client.open_object(tar_file_uid, use_cache=file._caching_enabled)
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
         return FileSlice(fd, offset, size, file.name)
 
 
@@ -181,7 +188,11 @@ class File(DataModel):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self._catalog = None
-        self._caching_enabled = False
+        self._caching_enabled: bool = False
+
+    @classmethod
+    def _from_row(cls, row: "RowDict") -> "Self":
+        return cls(**{key: row[key] for key in cls._datachain_column_types})
 
     @property
     def name(self):
@@ -192,19 +203,18 @@ class File(DataModel):
         return str(PurePosixPath(self.path).parent)
 
     @contextmanager
-    def open(self, mode: Literal["rb", "r"] = "rb"):
+    def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
             with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
         else:
-            uid = self.get_uid()
-            client = self._catalog.get_client(self.source)
             if self._caching_enabled:
-                …
+                self.ensure_cached()
+            client: Client = self._catalog.get_client(self.source)
             with client.open_object(
-                …
+                self, use_cache=self._caching_enabled, cb=self._download_cb
             ) as f:
                 yield io.TextIOWrapper(f) if mode == "r" else f
 
@@ -252,23 +262,25 @@ class File(DataModel):
         self._caching_enabled = caching_enabled
         self._download_cb = download_cb
 
-    def …
-    …
-    …
-    …
+    def ensure_cached(self) -> None:
+        if self._catalog is None:
+            raise RuntimeError(
+                "cannot download file to cache because catalog is not setup"
+            )
+        client = self._catalog.get_client(self.source)
+        client.download(self, callback=self._download_cb)
+
+    def get_local_path(self) -> Optional[str]:
+        """Return path to a file in a local cache.
 
-        …
-        …
-        …
+        Returns None if file is not cached.
+        Raises an exception if cache is not setup.
+        """
         if self._catalog is None:
             raise RuntimeError(
                 "cannot resolve local file path because catalog is not setup"
             )
-        …
-        if download:
-            client = self._catalog.get_client(self.source)
-            client.download(uid, callback=self._download_cb)
-        return self._catalog.cache.get_path(uid)
+        return self._catalog.cache.get_path(self)
 
     def get_file_suffix(self):
         """Returns last part of file name with `.`."""
@@ -323,6 +335,12 @@ class File(DataModel):
         """Returns `fsspec` filesystem for the file."""
         return self._catalog.get_client(self.source).fs
 
+    def get_hash(self) -> str:
+        fingerprint = f"{self.source}/{self.path}/{self.version}/{self.etag}"
+        if self.location:
+            fingerprint += f"/{self.location}"
+        return sha256(fingerprint.encode()).hexdigest()
+
     def resolve(self) -> "Self":
         """
         Resolve a File object by checking its existence and updating its metadata.
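
The new File.get_hash() above fingerprints a file as source/path/version/etag (plus location when set) and hashes it with SHA-256. A standalone restatement of that computation, with made-up field values for illustration:

    import hashlib

    # Hypothetical values; the real method reads them from the File instance.
    source, path, version, etag, location = "s3://bucket", "dir/cat.jpg", "", "abc123", None

    fingerprint = f"{source}/{path}/{version}/{etag}"
    if location:
        fingerprint += f"/{location}"

    # usedforsecurity=False mirrors the `sha256 = partial(...)` helper added above.
    print(hashlib.sha256(fingerprint.encode(), usedforsecurity=False).hexdigest())
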
datachain/lib/listing.py
CHANGED
@@ -11,6 +11,7 @@ from datachain.client import Client
 from datachain.lib.file import File
 from datachain.query.schema import Column
 from datachain.sql.functions import path as pathfunc
+from datachain.telemetry import telemetry
 from datachain.utils import uses_glob
 
 if TYPE_CHECKING:
@@ -77,8 +78,10 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[str, str, str]:
     """
     Parsing uri and returns listing dataset name, listing uri and listing path
     """
+    client_config = client_config or {}
     client = Client.get_client(uri, cache, **client_config)
     storage_uri, path = Client.parse_url(uri)
+    telemetry.log_param("client", client.PREFIX)
 
     # clean path without globs
     lst_uri_path = (
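
The added `client_config = client_config or {}` line lets callers pass None without the `**client_config` unpacking raising a TypeError. A minimal sketch of the same guard, with a hypothetical function name:

    from typing import Optional

    def connect(uri: str, client_config: Optional[dict] = None) -> None:
        client_config = client_config or {}  # same guard as in parse_listing_uri
        print(f"connect {uri} with {len(client_config)} option(s)")

    connect("s3://bucket")                   # a None config no longer breaks the call
    connect("s3://bucket", {"anon": True})
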
datachain/lib/tar.py
CHANGED
datachain/listing.py
CHANGED
@@ -156,12 +156,12 @@ class Listing:
 
     def instantiate_nodes(
         self,
-        all_nodes,
+        all_nodes: Iterable[NodeWithPath],
         output,
         total_files=None,
         force=False,
         shared_progress_bar=None,
-    ):
+    ) -> None:
         progress_bar = shared_progress_bar or tqdm(
             desc=f"Instantiating '{output}'",
             unit=" files",
@@ -175,8 +175,8 @@ class Listing:
             dst = os.path.join(output, *node.path)
             dst_dir = os.path.dirname(dst)
             os.makedirs(dst_dir, exist_ok=True)
-            …
-            self.client.instantiate_object(…
+            file = node.n.to_file(self.client.uri)
+            self.client.instantiate_object(file, dst, progress_bar, force)
             counter += 1
             if counter > 1000:
                 progress_bar.update(counter)
datachain/node.py
CHANGED
@@ -3,7 +3,7 @@ from typing import TYPE_CHECKING, Any, Optional
 
 import attrs
 
-from datachain.…
+from datachain.lib.file import File
 from datachain.storage import StorageURI
 from datachain.utils import TIME_ZERO, time_to_str
 
@@ -99,11 +99,11 @@ class Node:
             return self.path + "/"
         return self.path
 
-    def …
-        if …
-            …
-        return …
-            …
+    def to_file(self, source: Optional[StorageURI] = None) -> File:
+        if source is None:
+            source = self.source
+        return File(
+            source=source,
             path=self.path,
             size=self.size,
             version=self.version or "",
@@ -114,9 +114,23 @@ class Node:
         )
 
     @classmethod
-    def from_dict(cls, d: dict[str, Any]) -> "Self":
-        …
-        …
+    def from_dict(cls, d: dict[str, Any], file_prefix: str = "file") -> "Self":
+        def _dval(field_name: str):
+            return d.get(f"{file_prefix}__{field_name}")
+
+        return cls(
+            sys__id=d["sys__id"],
+            sys__rand=d["sys__rand"],
+            source=_dval("source"),
+            path=_dval("path"),
+            etag=_dval("etag"),
+            is_latest=_dval("is_latest"),
+            size=_dval("size"),
+            last_modified=_dval("last_modified"),
+            version=_dval("version"),
+            location=_dval("location"),
+            dir_type=DirType.FILE,
+        )
 
     @classmethod
     def from_dir(cls, path, **kwargs) -> "Node":
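
The reworked Node.from_dict reads File fields out of a flat row keyed by a "<file_prefix>__<field>" naming scheme. A self-contained sketch of that key-mapping idea (the helper and row below are illustrative, not the library's API):

    def unprefix(row: dict, prefix: str = "file") -> dict:
        """Strip a '<prefix>__' marker from flat row keys, as _dval does above."""
        marker = f"{prefix}__"
        return {k[len(marker):]: v for k, v in row.items() if k.startswith(marker)}

    row = {"sys__id": 1, "file__path": "dir/cat.jpg", "file__size": 1024}
    print(unprefix(row))  # {'path': 'dir/cat.jpg', 'size': 1024}
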
datachain/nodes_fetcher.py
CHANGED
@@ -1,12 +1,19 @@
 import logging
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
 
+from datachain.node import Node
 from datachain.nodes_thread_pool import NodesThreadPool
 
+if TYPE_CHECKING:
+    from datachain.cache import DataChainCache
+    from datachain.client.fsspec import Client
+
 logger = logging.getLogger("datachain")
 
 
 class NodesFetcher(NodesThreadPool):
-    def __init__(self, client, max_threads, cache):
+    def __init__(self, client: "Client", max_threads: int, cache: "DataChainCache"):
         super().__init__(max_threads)
         self.client = client
         self.cache = cache
@@ -15,7 +22,7 @@ class NodesFetcher(NodesThreadPool):
         for task in done:
             task.result()
 
-    def do_task(self, chunk):
+    def do_task(self, chunk: Iterable[Node]) -> None:
         from fsspec import Callback
 
         class _CB(Callback):
@@ -23,8 +30,8 @@ class NodesFetcher(NodesThreadPool):
                 self.increase_counter(inc)
 
         for node in chunk:
-            …
-            if self.cache.contains(…
+            file = node.to_file(self.client.uri)
+            if self.cache.contains(file):
                 self.increase_counter(node.size)
             else:
-                self.client.put_in_cache(…
+                self.client.put_in_cache(file, callback=_CB())
datachain/nodes_thread_pool.py
CHANGED
@@ -20,7 +20,7 @@ class NodeChunk:
     def next_downloadable(self):
         node = next(self.nodes, None)
         while node and (
-            not node.is_downloadable or self.cache.contains(node.…
+            not node.is_downloadable or self.cache.contains(node.to_file(self.storage))
         ):
             node = next(self.nodes, None)
         return node
datachain/progress.py
CHANGED
@@ -1,8 +1,6 @@
 """Manages progress bars."""
 
 import logging
-import os
-import re
 import sys
 from threading import RLock
 from typing import Any, ClassVar
@@ -10,20 +8,12 @@ from typing import Any, ClassVar
 from fsspec.callbacks import TqdmCallback
 from tqdm import tqdm
 
+from datachain.utils import env2bool
+
 logger = logging.getLogger(__name__)
 tqdm.set_lock(RLock())
 
 
-def env2bool(var, undefined=False):
-    """
-    undefined: return value if env var is unset
-    """
-    var = os.getenv(var, None)
-    if var is None:
-        return undefined
-    return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
-
-
 class Tqdm(tqdm):
     """
    maximum-compatibility tqdm-based progressbars
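
env2bool now lives in datachain.utils and is imported from there; the copy removed above treats any value matching 1/y/yes/true (case-insensitively) as truthy. A standalone restatement of the removed helper:

    import os
    import re

    def env2bool(var, undefined=False):
        """undefined: return value if env var is unset."""
        value = os.getenv(var, None)
        if value is None:
            return undefined
        return bool(re.search("1|y|yes|true", value, flags=re.IGNORECASE))

    os.environ["EXAMPLE_FLAG"] = "Yes"   # hypothetical variable name
    print(env2bool("EXAMPLE_FLAG"))      # True
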
datachain/query/__init__.py
CHANGED
@@ -2,7 +2,6 @@ from .dataset import DatasetQuery
 from .params import param
 from .schema import C, DatasetRow, LocalFilename, Object, Stream
 from .session import Session
-from .udf import udf
 
 __all__ = [
     "C",
@@ -13,5 +12,4 @@ __all__ = [
     "Session",
     "Stream",
     "param",
-    "udf",
 ]