datachain 0.18.11__py3-none-any.whl → 0.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/client/fsspec.py +1 -0
- datachain/client/hf.py +19 -2
- datachain/delta.py +164 -39
- datachain/lib/arrow.py +2 -0
- datachain/lib/dc/datachain.py +19 -4
- datachain/lib/dc/datasets.py +12 -1
- datachain/lib/dc/storage.py +13 -1
- datachain/lib/file.py +40 -11
- datachain/semver.py +15 -5
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/METADATA +56 -2
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/RECORD +15 -15
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/WHEEL +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/top_level.txt +0 -0
datachain/client/fsspec.py
CHANGED
datachain/client/hf.py
CHANGED
@@ -15,6 +15,24 @@ class classproperty: # noqa: N801
         return self.fget(owner)


+def _wrap_class(sync_fs_class):
+    """
+    Analog of `AsyncFileSystemWrapper.wrap_class` from fsspec, but sets
+    asynchronous to False by default. This is similar to other Async FS
+    we initialize. E.g. it means we don't break things in Jupyter where code
+    run in async.
+    """
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+
+    class GeneratedAsyncFileSystemWrapper(AsyncFileSystemWrapper):
+        def __init__(self, *args, **kwargs):
+            sync_fs = sync_fs_class(*args, **kwargs)
+            super().__init__(sync_fs, asynchronous=False)
+
+    GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
+    return GeneratedAsyncFileSystemWrapper
+
+
 @functools.cache
 def get_hf_filesystem_cls():
     import fsspec
@@ -29,10 +47,9 @@ def get_hf_filesystem_cls():
         f"{fsspec_version} is installed."
     )

-    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
     from huggingface_hub import HfFileSystem

-    fs_cls =
+    fs_cls = _wrap_class(HfFileSystem)
     # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
     fs_cls.protocol = HfFileSystem.protocol
     return fs_cls
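A minimal sketch of how the new factory behaves, assuming fsspec >= 2024.12.0 and using MemoryFileSystem as a stand-in for HfFileSystem (neither the stand-in nor this snippet is part of the diff):

    from fsspec.implementations.memory import MemoryFileSystem

    from datachain.client.hf import _wrap_class

    # Build an async wrapper class around a synchronous filesystem; unlike
    # AsyncFileSystemWrapper.wrap_class, asynchronous defaults to False so the
    # filesystem stays usable from environments that already run an event loop.
    AsyncMemoryFS = _wrap_class(MemoryFileSystem)
    fs = AsyncMemoryFS()
    print(type(fs).__name__)  # AsyncMemoryFileSystemWrapper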
datachain/delta.py
CHANGED
@@ -48,72 +48,197 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
     return dc


-def
+def _get_delta_chain(
+    source_ds_name: str,
+    source_ds_version: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    compare: Optional[Union[str, Sequence[str]]] = None,
+) -> "DataChain":
+    """Get delta chain for processing changes between versions."""
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Calculate diff between source versions
+    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+
+
+def _get_retry_chain(
+    name: str,
+    latest_version: str,
+    source_ds_name: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]],
+    delta_retry: Optional[Union[bool, str]],
+) -> Optional["DataChain"]:
+    """Get retry chain for processing error records and missing records."""
+    # Import here to avoid circular import
+    from datachain.lib.dc import C
+
+    retry_chain = None
+
+    # Read the latest version of the result dataset for retry logic
+    result_dataset = datachain.read_dataset(name, latest_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Handle error records if delta_retry is a string (column name)
+    if isinstance(delta_retry, str):
+        error_records = result_dataset.filter(C(delta_retry) != "")
+        error_source_records = source_dc_latest.merge(
+            error_records, on=on, right_on=right_on, inner=True
+        ).select(*list(source_dc_latest.signals_schema.values))
+        retry_chain = error_source_records
+
+    # Handle missing records if delta_retry is True
+    elif delta_retry is True:
+        missing_records = source_dc_latest.subtract(
+            result_dataset, on=on, right_on=right_on
+        )
+        retry_chain = missing_records
+
+    return retry_chain
+
+
+def _get_source_info(
+    name: str,
+    latest_version: str,
+    catalog,
+) -> tuple[
+    Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
+]:
+    """Get source dataset information and dependencies.
+
+    Returns:
+        Tuple of (source_name, source_version, source_latest_version, dependencies)
+        Returns (None, None, None, None) if source dataset was removed.
+    """
+    dependencies = catalog.get_dataset_dependencies(
+        name, latest_version, indirect=False
+    )
+
+    dep = dependencies[0]
+    if not dep:
+        # Starting dataset was removed, back off to normal dataset creation
+        return None, None, None, None
+
+    source_ds_name = dep.name
+    source_ds_version = dep.version
+    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
+
+    return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
+
+
+def delta_retry_update(
     dc: "DataChain",
     name: str,
     on: Union[str, Sequence[str]],
     right_on: Optional[Union[str, Sequence[str]]] = None,
     compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
     """
     Creates new chain that consists of the last version of current delta dataset
     plus diff from the source with all needed modifications.
-    This way we don't need to re-calculate the whole chain from the source again
-    apply all the DataChain methods like filters, mappers, generators etc.)
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
     but just the diff part which is very important for performance.

-    Note that currently delta update works only if there is only one direct
+    Note that currently delta update works only if there is only one direct
+    dependency.
+
+    Additionally supports retry functionality to filter records that either:
+    1. Have a non-None value in the field specified by delta_retry (when it's a string)
+    2. Exist in the source dataset but are missing in the result dataset
+       (when delta_retry=True)
+
+    Parameters:
+        dc: The DataChain to filter for records that need reprocessing
+        name: Name of the destination dataset
+        on: Field(s) in source dataset that uniquely identify records
+        right_on: Corresponding field(s) in result dataset if they differ from
+            source
+        compare: Field(s) used to check if the same row has been modified
+        delta_retry: If string, field in result dataset that indicates an error
+            when not None. If True, include records missing from result dataset.
+            If False/None, no retry functionality.
+
+    Returns:
+        A tuple containing (filtered chain for delta/retry processing,
+        dependencies, found records flag)
     """
+
     catalog = dc.session.catalog
     dc._query.apply_listing_pre_step()

+    # Check if dataset exists
     try:
-
+        dataset = catalog.get_dataset(name)
+        latest_version = dataset.latest_version
     except DatasetNotFoundError:
-        #
+        # First creation of result dataset
         return None, None, True

-
-
+    # Initialize variables
+    diff_chain = None
+    dependencies = None
+    retry_chain = None
+    processing_chain = None
+
+    source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
+        _get_source_info(name, latest_version, catalog)
     )

-
-    if
-        # starting dataset (e.g listing) was removed so we are backing off to normal
-        # dataset creation, as it was created first time
+    # If source_ds_name is None, starting dataset was removed
+    if source_ds_name is None:
         return None, None, True

-
-
-    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
-    dependencies = copy(dependencies)
-    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
-    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+    assert source_ds_version
+    assert source_ds_latest_version

-
-
+    diff_chain = _get_delta_chain(
+        source_ds_name, source_ds_version, source_ds_latest_version, on, compare
+    )
+
+    # Filter out removed dep
+    if dependencies:
+        dependencies = copy(dependencies)
+        dependencies = [d for d in dependencies if d is not None]
+        # Update to latest version
+        dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+
+    # Handle retry functionality if enabled
+    if delta_retry:
+        retry_chain = _get_retry_chain(
+            name,
+            latest_version,
+            source_ds_name,
+            source_ds_latest_version,
+            on,
+            right_on,
+            delta_retry,
+        )

-
-
-
+    # Combine delta and retry chains
+    if retry_chain is not None:
+        processing_chain = diff_chain.union(retry_chain)
+    else:
+        processing_chain = diff_chain

-    #
-
+    # Apply all the steps from the original chain to processing_chain
+    processing_chain = _append_steps(processing_chain, dc).persist()

-    if
+    # Check if chain becomes empty after applying steps
+    if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False

-
-
-
-
-
-
-
-        modified=False,
-        deleted=False,
-    )
-    .union(diff)
+    latest_dataset = datachain.read_dataset(name, latest_version)
+    compared_chain = latest_dataset.compare(
+        processing_chain,
+        on=right_on or on,
+        added=True,
+        modified=False,
+        deleted=False,
     )
-
-    return
+    result_chain = compared_chain.union(processing_chain)
+    return result_chain, dependencies, True
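In user-level terms, the helpers above build two chains and union them: a delta chain (rows added or changed between two versions of the source) and a retry chain (rows whose previous result recorded an error, or rows missing from the result). A rough sketch using the public API, with hypothetical dataset names "source" and "result" and a hypothetical key field "id":

    import datachain as dc
    from datachain import C

    # Delta part: rows added or changed between two versions of the source dataset.
    old_src = dc.read_dataset("source", "1.0.0")
    new_src = dc.read_dataset("source", "1.0.1")
    delta = new_src.compare(old_src, on="id", deleted=False)

    # Retry part for delta_retry="error": source rows whose saved result has an error set.
    result = dc.read_dataset("result")
    errors = result.filter(C("error") != "")
    retry = (
        new_src.merge(errors, on="id", inner=True)
        .select(*list(new_src.signals_schema.values))
    )

    # Retry part for delta_retry=True would instead be: new_src.subtract(result, on="id")
    to_process = delta.union(retry)  # roughly what gets fed back through the original steps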
datachain/lib/arrow.py
CHANGED
@@ -241,6 +241,8 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+    if pa.types.is_null(col_type):
+        return str  # use strings for null columns
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")


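The new branch means an Arrow column whose type is null (for example a Parquet column that contains only nulls) now maps to str instead of raising. A small standalone check of the pyarrow predicate it relies on:

    import pyarrow as pa

    col = pa.array([None, None])       # all-null column; its Arrow type is null
    print(col.type)                    # null
    print(pa.types.is_null(col.type))  # True -> arrow_type_mapper now returns str for it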
datachain/lib/dc/datachain.py
CHANGED
@@ -25,7 +25,7 @@ from tqdm import tqdm

 from datachain import semver
 from datachain.dataset import DatasetRecord
-from datachain.delta import delta_disabled
+from datachain.delta import delta_disabled
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -169,6 +169,10 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_compare: Optional[Union[str, Sequence[str]]] = None
+        self._delta_retry: Optional[Union[bool, str]] = None

     def __repr__(self) -> str:
         """Return a string representation of the chain."""
@@ -187,6 +191,7 @@ class DataChain:
         on: Optional[Union[str, Sequence[str]]] = None,
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
+        delta_retry: Optional[Union[bool, str]] = None,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -196,6 +201,7 @@ class DataChain:
         self._delta_on = on
         self._delta_result_on = right_on
         self._delta_compare = compare
+        self._delta_retry = delta_retry
         return self

     @property
@@ -293,6 +299,7 @@ class DataChain:
                 on=self._delta_on,
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
+                delta_retry=self._delta_retry,
             )

         return chain
@@ -529,18 +536,26 @@ class DataChain:
         )

         schema = self.signals_schema.clone_without_sys_signals().serialize()
+
+        # Handle retry and delta functionality
         if self.delta and name:
-
+            from datachain.delta import delta_retry_update
+
+            # Delta chains must have delta_on defined (ensured by _as_delta method)
+            assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+            result_ds, dependencies, has_changes = delta_retry_update(
                 self,
                 name,
                 on=self._delta_on,
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
+                delta_retry=self._delta_retry,
             )

-            if
+            if result_ds:
                 return self._evolve(
-                    query=
+                    query=result_ds._query.save(
                         name=name,
                         version=version,
                         feature_schema=schema,
datachain/lib/dc/datasets.py
CHANGED
@@ -32,6 +32,7 @@ def read_dataset(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -73,6 +74,11 @@ def read_dataset(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in delta_on will be used.
+        delta_retry: Specifies retry behavior for delta processing. If a string,
+            it's the name of a field in the result dataset that indicates an error
+            when not None - records with errors will be reprocessed. If True,
+            records that exist in the source dataset but not in the result dataset
+            will be reprocessed.

     Example:
         ```py
@@ -149,10 +155,15 @@ def read_dataset(
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
     chain = DataChain(query, _settings, signals_schema)
+
     if delta:
         chain = chain._as_delta(
-            on=delta_on,
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
         )
+
     return chain


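A minimal usage sketch for the new parameter, assuming a previously saved dataset named "docs", a key field "id", and an "error" field in the derived result (all hypothetical):

    import datachain as dc

    chain = dc.read_dataset(
        "docs",
        delta=True,           # process only rows added or changed since the last run
        delta_on="id",        # hypothetical field that uniquely identifies a record
        delta_retry="error",  # also reprocess rows whose previous result has an error set
    )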
datachain/lib/dc/storage.py
CHANGED
@@ -38,6 +38,7 @@ def read_storage(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -83,6 +84,13 @@ def read_storage(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in `delta_on` will be used.
+        delta_retry: Controls which records to reprocess. Can be:
+            - A string specifying a field name: Records where this field is not None
+              will be reprocessed (error checking mode).
+            - True: Records that exist in the source dataset but not in the result
+              dataset (based on delta_on/delta_result_on fields) will be reprocessed
+              (missing records mode).
+            - False or None: No retry processing.

     Returns:
         DataChain: A DataChain object containing the file information.
@@ -208,6 +216,10 @@ def read_storage(

     if delta:
         storage_chain = storage_chain._as_delta(
-            on=delta_on,
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
         )
+
     return storage_chain
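The storage variant looks the same; here is a sketch of the missing-records mode (delta_retry=True), with a placeholder bucket path and a trivial mapper:

    import datachain as dc
    from datachain import File

    def file_size(file: File) -> int:
        return file.size

    chain = (
        dc.read_storage(
            "s3://my-bucket/data/",  # placeholder path
            delta=True,              # only new or changed files since the last run
            delta_on="file.path",    # identify files by their path
            delta_retry=True,        # also reprocess files missing from the saved result
        )
        .map(size_bytes=file_size)
        .save(name="processed_data")
    )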
datachain/lib/file.py
CHANGED
@@ -127,10 +127,7 @@ class TarVFile(VFile):
     @classmethod
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
-
-        raise VFileError(
-            "multiple 'location's are not supported yet", file.source, file.path
-        )
+        tar_file = cls.parent(file, location)

         loc = location[0]

@@ -140,15 +137,26 @@ class TarVFile(VFile):
         if (size := loc.get("size", None)) is None:
             raise VFileError("'size' is not specified", file.source, file.path)

+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
+        return FileSlice(fd, offset, size, file.name)
+
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        if len(location) > 1:
+            raise VFileError(
+                "multiple 'location's are not supported yet", file.source, file.path
+            )
+
+        loc = location[0]
+
         if (parent := loc.get("parent", None)) is None:
             raise VFileError("'parent' is not specified", file.source, file.path)

         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)

-
-        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
-        return FileSlice(fd, offset, size, file.name)
+        return tar_file


 class VFileRegistry:
@@ -159,7 +167,7 @@ class VFileRegistry:
         cls._vtype_readers[reader.get_vtype()] = reader

     @classmethod
-    def
+    def _get_reader(cls, file: "File", location: list[dict]):
         if len(location) == 0:
             raise VFileError(
                 "'location' must not be list of JSONs", file.source, file.path
@@ -174,8 +182,18 @@ class VFileRegistry:
                 "reader not registered", file.source, file.path, vtype=vtype
             )

+        return reader
+
+    @classmethod
+    def open(cls, file: "File", location: list[dict]):
+        reader = cls._get_reader(file, location)
         return reader.open(file, location)

+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        reader = cls._get_reader(file, location)
+        return reader.parent(file, location)
+

 class File(DataModel):
     """
@@ -330,7 +348,7 @@ class File(DataModel):
     def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
-            with VFileRegistry.
+            with VFileRegistry.open(self, self.location) as f:  # type: ignore[arg-type]
                 yield f

         else:
@@ -349,6 +367,13 @@ class File(DataModel):

     def read_text(self):
         """Returns file contents as text."""
+        if self.location:
+            raise VFileError(
+                "Reading text from virtual file is not supported",
+                self.source,
+                self.path,
+            )
+
         with self.open(mode="r") as stream:
             return stream.read()

@@ -427,9 +452,13 @@ class File(DataModel):
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")

+        file = self
+        if self.location:
+            file = VFileRegistry.parent(self, self.location)  # type: ignore[arg-type]
+
         client = self._catalog.get_client(self.source)
-        await client._download(
-
+        await client._download(file, callback=download_cb or self._download_cb)
+        file._set_stream(
             self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
         )
         return True
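For orientation: a File with a non-empty location is a virtual file addressed inside a container such as a tar archive, and the new parent()/VFileRegistry.parent() path lets prefetching download the enclosing archive rather than the member itself. The record below is a hypothetical illustration; the exact key set of a location entry is assumed from the fields read above (vtype, parent, offset, size):

    from datachain import File

    # Hypothetical virtual-file record describing one member of archive.tar.
    member = File(
        path="archive.tar/images/cat.jpg",
        location=[{
            "vtype": "tar",                     # selects the TarVFile reader in VFileRegistry
            "parent": {"path": "archive.tar"},  # the enclosing tar file
            "offset": 512,                      # assumed: byte offset of the member in the tar
            "size": 4096,                       # size of the member in bytes
        }],
    )

    # VFileRegistry.open(member, member.location)   -> streams the member (TarVFile.open)
    # VFileRegistry.parent(member, member.location) -> the enclosing tar as a File, which
    #                                                  _prefetch() now downloads instead.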
datachain/semver.py
CHANGED
@@ -1,8 +1,13 @@
+# Maximum version number for semver (major.minor.patch) is 999999.999999.999999
+# this number was chosen because value("999999.999999.999999") < 2**63 - 1
+MAX_VERSION_NUMBER = 999_999
+
+
 def parse(version: str) -> tuple[int, int, int]:
     """Parsing semver into 3 integers: major, minor, patch"""
     validate(version)
     parts = version.split(".")
-    return
+    return int(parts[0]), int(parts[1]), int(parts[2])


 def validate(version: str) -> None:
@@ -20,14 +25,18 @@ def validate(version: str) -> None:
     for part in parts:
         try:
             val = int(part)
-            assert val
+            assert 0 <= val <= MAX_VERSION_NUMBER
         except (ValueError, AssertionError):
             raise ValueError(error_message) from None


 def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:
     """Creates new semver from 3 integers: major, minor and patch"""
-    if
+    if not (
+        0 <= major <= MAX_VERSION_NUMBER
+        and 0 <= minor <= MAX_VERSION_NUMBER
+        and 0 <= patch <= MAX_VERSION_NUMBER
+    ):
         raise ValueError("Major, minor and patch must be greater or equal to zero")

     return ".".join([str(major), str(minor), str(patch)])
@@ -35,10 +44,11 @@ def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:

 def value(version: str) -> int:
     """
-    Calculate integer value of a version. This is useful when comparing two versions
+    Calculate integer value of a version. This is useful when comparing two versions.
     """
     major, minor, patch = parse(version)
-
+    limit = MAX_VERSION_NUMBER + 1
+    return major * (limit**2) + minor * limit + patch


 def compare(v1: str, v2: str) -> int:
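A quick arithmetic check of the packing used by value(): with MAX_VERSION_NUMBER = 999_999 the base is 1_000_000, and the largest encodable version still fits in a signed 64-bit integer, as the new comment states:

    MAX_VERSION_NUMBER = 999_999
    limit = MAX_VERSION_NUMBER + 1  # 1_000_000

    def packed(major: int, minor: int, patch: int) -> int:
        return major * (limit**2) + minor * limit + patch

    print(packed(1, 2, 3))                                 # 1000002000003
    print(packed(999_999, 999_999, 999_999))               # 999999999999999999
    print(packed(999_999, 999_999, 999_999) < 2**63 - 1)   # True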
{datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.18.11
+Version: 0.19.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -146,6 +146,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline

 Getting Started
 ===============
@@ -158,7 +164,7 @@ to get started with `DataChain` and learn more.
    pip install datachain


-Example:
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------

 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -182,6 +188,54 @@ high confidence scores.
    likely_cats.to_storage("high-confidence-cats/", signal="file")


+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+    from datachain import C, File
+
+    def process_file(file: File):
+        """Process a file, which may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = analyze_content(content)
+            return {
+                "content": content,
+                "result": result,
+                "error": None  # No error
+            }
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return {
+                "content": None,
+                "result": None,
+                "error": str(e)  # Error field will trigger retry
+            }
+
+    # Process files efficiently with delta and retry
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,  # Process only new/changed files
+            delta_on="file.path",  # Identify files by path
+            retry_on="error"  # Field that indicates errors
+        )
+        .map(processed_result=process_file)
+        .mutate(
+            content=C("processed_result.content"),
+            result=C("processed_result.result"),
+            error=C("processed_result.error")
+        )
+        .save(name="processed_data")
+    )
+
 Example: LLM based text-file evaluation
 ---------------------------------------
{datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/RECORD
CHANGED
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
-datachain/delta.py,sha256=
+datachain/delta.py,sha256=fP1Yy_MfdnTZmIOe243SBiDWTzd6MqLw0tQxvZNxLcs,8384
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=JtExYIfKMFhEIIcSSWBmaxWpoS3ben7kb692cHHm4Lo,7079
@@ -14,7 +14,7 @@ datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2Sm
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
-datachain/semver.py,sha256=
+datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
@@ -39,9 +39,9 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=
+datachain/client/fsspec.py,sha256=huPHNDZRGz_rSN7XnS9hKmRoS2fsSz_y2-cxUSlvsOA,13938
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
-datachain/client/hf.py,sha256=
+datachain/client/hf.py,sha256=mRBqHeBT758TJicU-Fn2L3l5AbHWwMzycWwttNUACKk,2180
 datachain/client/local.py,sha256=cGoCYflribzexiOe-Y1qbaE2fJRh-_EgQrfCSa0yK_E,4568
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
@@ -68,11 +68,11 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=
+datachain/lib/arrow.py,sha256=2IuNZ6tRFsxVNhWElqr0ptz28geSDzlDHUtzD4qeDNM,10339
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo,3179
-datachain/lib/file.py,sha256=
+datachain/lib/file.py,sha256=PuTa6CEG9CaJXPhxrZFY-R9-DS7ynB9l7Y0bUbd_Qwg,31952
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
@@ -99,15 +99,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=cQjq6_OWQ_1JKvIqb8snl6mKfuBbpllPEao5ygVINog,81733
+datachain/lib/dc/datasets.py,sha256=g_bBGCUwAwNJypYSUQvrDDqnaw7nfXpvrEvUVPtWATY,11268
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=J1I69J2gFIBjRTGr2LG-5qn_rTVzRLcr2y3tVDrmHdg,3068
-datachain/lib/dc/storage.py,sha256=
+datachain/lib/dc/storage.py,sha256=u-QB_0sn1Wwc0-9phi1zT38UDe5uBIc25xbAhKMU2fA,8774
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.19.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.19.1.dist-info/METADATA,sha256=qg4KSU457ARE-A00yjNYNtFP3vhX0yqsxrCGKctXva4,13281
+datachain-0.19.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.19.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.19.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.19.1.dist-info/RECORD,,
File without changes: {datachain-0.18.11.dist-info → datachain-0.19.1.dist-info}/WHEEL, entry_points.txt, licenses/LICENSE, and top_level.txt.