datachain 0.18.11__py3-none-any.whl → 0.19.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


datachain/client/fsspec.py CHANGED
@@ -330,6 +330,7 @@ class Client(ABC):
         return getattr(self.fs, "version_aware", False)
 
     async def ls_dir(self, path):
+        kwargs = {}
         if self._is_version_aware():
             kwargs = {"versions": True}
         return await self.fs._ls(path, detail=True, **kwargs)
datachain/client/hf.py CHANGED
@@ -15,6 +15,24 @@ class classproperty: # noqa: N801
         return self.fget(owner)
 
 
+def _wrap_class(sync_fs_class):
+    """
+    Analog of `AsyncFileSystemWrapper.wrap_class` from fsspec, but sets
+    asynchronous to False by default. This is similar to other Async FS
+    we initialize. E.g. it means we don't break things in Jupyter where code
+    run in async.
+    """
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+
+    class GeneratedAsyncFileSystemWrapper(AsyncFileSystemWrapper):
+        def __init__(self, *args, **kwargs):
+            sync_fs = sync_fs_class(*args, **kwargs)
+            super().__init__(sync_fs, asynchronous=False)
+
+    GeneratedAsyncFileSystemWrapper.__name__ = f"Async{sync_fs_class.__name__}Wrapper"
+    return GeneratedAsyncFileSystemWrapper
+
+
 @functools.cache
 def get_hf_filesystem_cls():
     import fsspec
@@ -29,10 +47,9 @@ def get_hf_filesystem_cls():
         f"{fsspec_version} is installed."
     )
 
-    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
     from huggingface_hub import HfFileSystem
 
-    fs_cls = AsyncFileSystemWrapper.wrap_class(HfFileSystem)
+    fs_cls = _wrap_class(HfFileSystem)
     # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
     fs_cls.protocol = HfFileSystem.protocol
     return fs_cls
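
For context, a minimal sketch (not part of the diff) of how the generated wrapper class behaves; it assumes huggingface_hub and a recent fsspec that ships AsyncFileSystemWrapper are installed:

    # Sketch only: exercise the wrapper returned by get_hf_filesystem_cls().
    from datachain.client.hf import get_hf_filesystem_cls

    fs_cls = get_hf_filesystem_cls()   # async wrapper class around HfFileSystem
    print(fs_cls.__name__)             # "AsyncHfFileSystemWrapper"
    print(fs_cls.protocol)             # class property copied back from HfFileSystem
    fs = fs_cls()                      # asynchronous=False, so safe to construct in Jupyter
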
datachain/delta.py CHANGED
@@ -48,72 +48,197 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
     return dc
 
 
-def delta_update(
+def _get_delta_chain(
+    source_ds_name: str,
+    source_ds_version: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    compare: Optional[Union[str, Sequence[str]]] = None,
+) -> "DataChain":
+    """Get delta chain for processing changes between versions."""
+    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Calculate diff between source versions
+    return source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
+
+
+def _get_retry_chain(
+    name: str,
+    latest_version: str,
+    source_ds_name: str,
+    source_ds_latest_version: str,
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]],
+    delta_retry: Optional[Union[bool, str]],
+) -> Optional["DataChain"]:
+    """Get retry chain for processing error records and missing records."""
+    # Import here to avoid circular import
+    from datachain.lib.dc import C
+
+    retry_chain = None
+
+    # Read the latest version of the result dataset for retry logic
+    result_dataset = datachain.read_dataset(name, latest_version)
+    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+
+    # Handle error records if delta_retry is a string (column name)
+    if isinstance(delta_retry, str):
+        error_records = result_dataset.filter(C(delta_retry) != "")
+        error_source_records = source_dc_latest.merge(
+            error_records, on=on, right_on=right_on, inner=True
+        ).select(*list(source_dc_latest.signals_schema.values))
+        retry_chain = error_source_records
+
+    # Handle missing records if delta_retry is True
+    elif delta_retry is True:
+        missing_records = source_dc_latest.subtract(
+            result_dataset, on=on, right_on=right_on
+        )
+        retry_chain = missing_records
+
+    return retry_chain
+
+
+def _get_source_info(
+    name: str,
+    latest_version: str,
+    catalog,
+) -> tuple[
+    Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
+]:
+    """Get source dataset information and dependencies.
+
+    Returns:
+        Tuple of (source_name, source_version, source_latest_version, dependencies)
+        Returns (None, None, None, None) if source dataset was removed.
+    """
+    dependencies = catalog.get_dataset_dependencies(
+        name, latest_version, indirect=False
+    )
+
+    dep = dependencies[0]
+    if not dep:
+        # Starting dataset was removed, back off to normal dataset creation
+        return None, None, None, None
+
+    source_ds_name = dep.name
+    source_ds_version = dep.version
+    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
+
+    return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
+
+
+def delta_retry_update(
     dc: "DataChain",
     name: str,
     on: Union[str, Sequence[str]],
     right_on: Optional[Union[str, Sequence[str]]] = None,
     compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> tuple[Optional["DataChain"], Optional[list[DatasetDependency]], bool]:
     """
     Creates new chain that consists of the last version of current delta dataset
     plus diff from the source with all needed modifications.
-    This way we don't need to re-calculate the whole chain from the source again(
-    apply all the DataChain methods like filters, mappers, generators etc.)
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
     but just the diff part which is very important for performance.
 
-    Note that currently delta update works only if there is only one direct dependency.
+    Note that currently delta update works only if there is only one direct
+    dependency.
+
+    Additionally supports retry functionality to filter records that either:
+    1. Have a non-None value in the field specified by delta_retry (when it's a string)
+    2. Exist in the source dataset but are missing in the result dataset
+       (when delta_retry=True)
+
+    Parameters:
+        dc: The DataChain to filter for records that need reprocessing
+        name: Name of the destination dataset
+        on: Field(s) in source dataset that uniquely identify records
+        right_on: Corresponding field(s) in result dataset if they differ from
+            source
+        compare: Field(s) used to check if the same row has been modified
+        delta_retry: If string, field in result dataset that indicates an error
+            when not None. If True, include records missing from result dataset.
+            If False/None, no retry functionality.
+
+    Returns:
+        A tuple containing (filtered chain for delta/retry processing,
+        dependencies, found records flag)
     """
+
     catalog = dc.session.catalog
     dc._query.apply_listing_pre_step()
 
+    # Check if dataset exists
     try:
-        latest_version = catalog.get_dataset(name).latest_version
+        dataset = catalog.get_dataset(name)
+        latest_version = dataset.latest_version
     except DatasetNotFoundError:
-        # first creation of delta update dataset
+        # First creation of result dataset
         return None, None, True
 
-    dependencies = catalog.get_dataset_dependencies(
-        name, latest_version, indirect=False
+    # Initialize variables
+    diff_chain = None
+    dependencies = None
+    retry_chain = None
+    processing_chain = None
+
+    source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
+        _get_source_info(name, latest_version, catalog)
     )
 
-    dep = dependencies[0]
-    if not dep:
-        # starting dataset (e.g listing) was removed so we are backing off to normal
-        # dataset creation, as it was created first time
+    # If source_ds_name is None, starting dataset was removed
+    if source_ds_name is None:
         return None, None, True
 
-    source_ds_name = dep.name
-    source_ds_version = dep.version
-    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
-    dependencies = copy(dependencies)
-    dependencies = [d for d in dependencies if d is not None]  # filter out removed dep
-    dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+    assert source_ds_version
+    assert source_ds_latest_version
 
-    source_dc = datachain.read_dataset(source_ds_name, source_ds_version)
-    source_dc_latest = datachain.read_dataset(source_ds_name, source_ds_latest_version)
+    diff_chain = _get_delta_chain(
+        source_ds_name, source_ds_version, source_ds_latest_version, on, compare
+    )
+
+    # Filter out removed dep
+    if dependencies:
+        dependencies = copy(dependencies)
+        dependencies = [d for d in dependencies if d is not None]
+        # Update to latest version
+        dependencies[0].version = source_ds_latest_version  # type: ignore[union-attr]
+
+    # Handle retry functionality if enabled
+    if delta_retry:
+        retry_chain = _get_retry_chain(
+            name,
+            latest_version,
+            source_ds_name,
+            source_ds_latest_version,
+            on,
+            right_on,
+            delta_retry,
+        )
 
-    diff = source_dc_latest.compare(source_dc, on=on, compare=compare, deleted=False)
-    # We append all the steps from the original chain to diff, e.g filters, mappers.
-    diff = _append_steps(diff, dc)
+    # Combine delta and retry chains
+    if retry_chain is not None:
+        processing_chain = diff_chain.union(retry_chain)
+    else:
+        processing_chain = diff_chain
 
-    # to avoid re-calculating diff multiple times
-    diff = diff.persist()
+    # Apply all the steps from the original chain to processing_chain
+    processing_chain = _append_steps(processing_chain, dc).persist()
 
-    if diff.empty:
+    # Check if chain becomes empty after applying steps
+    if processing_chain is None or (processing_chain and processing_chain.empty):
        return None, None, False
 
-    # merging diff and the latest version of dataset
-    delta_chain = (
-        datachain.read_dataset(name, latest_version)
-        .compare(
-            diff,
-            on=right_on or on,
-            added=True,
-            modified=False,
-            deleted=False,
-        )
-        .union(diff)
+    latest_dataset = datachain.read_dataset(name, latest_version)
+    compared_chain = latest_dataset.compare(
+        processing_chain,
+        on=right_on or on,
+        added=True,
+        modified=False,
+        deleted=False,
     )
-
-    return delta_chain, dependencies, True  # type: ignore[return-value]
+    result_chain = compared_chain.union(processing_chain)
+    return result_chain, dependencies, True
datachain/lib/arrow.py CHANGED
@@ -241,6 +241,8 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+    if pa.types.is_null(col_type):
+        return str  # use strings for null columns
     raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
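
A minimal illustration of the new branch (a sketch, not from the diff): a column whose values are all missing gets the Arrow type pa.null(), which arrow_type_mapper now maps to str instead of raising TypeError:

    # Sketch only: pa.null() is the Arrow type of an all-missing column.
    import pyarrow as pa
    from datachain.lib.arrow import arrow_type_mapper

    assert arrow_type_mapper(pa.null(), column="notes") is str  # previously raised TypeError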
 
datachain/lib/dc/datachain.py CHANGED
@@ -25,7 +25,7 @@ from tqdm import tqdm
 
 from datachain import semver
 from datachain.dataset import DatasetRecord
-from datachain.delta import delta_disabled, delta_update
+from datachain.delta import delta_disabled
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -169,6 +169,10 @@ class DataChain:
         self._setup: dict = setup or {}
         self._sys = _sys
         self._delta = False
+        self._delta_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_result_on: Optional[Union[str, Sequence[str]]] = None
+        self._delta_compare: Optional[Union[str, Sequence[str]]] = None
+        self._delta_retry: Optional[Union[bool, str]] = None
 
     def __repr__(self) -> str:
         """Return a string representation of the chain."""
@@ -187,6 +191,7 @@ class DataChain:
         on: Optional[Union[str, Sequence[str]]] = None,
         right_on: Optional[Union[str, Sequence[str]]] = None,
         compare: Optional[Union[str, Sequence[str]]] = None,
+        delta_retry: Optional[Union[bool, str]] = None,
     ) -> "Self":
         """Marks this chain as delta, which means special delta process will be
         called on saving dataset for optimization"""
@@ -196,6 +201,7 @@ class DataChain:
         self._delta_on = on
         self._delta_result_on = right_on
         self._delta_compare = compare
+        self._delta_retry = delta_retry
         return self
 
     @property
@@ -293,6 +299,7 @@ class DataChain:
             on=self._delta_on,
             right_on=self._delta_result_on,
             compare=self._delta_compare,
+            delta_retry=self._delta_retry,
         )
 
         return chain
@@ -529,18 +536,26 @@ class DataChain:
         )
 
         schema = self.signals_schema.clone_without_sys_signals().serialize()
+
+        # Handle retry and delta functionality
         if self.delta and name:
-            delta_ds, dependencies, has_changes = delta_update(
+            from datachain.delta import delta_retry_update
+
+            # Delta chains must have delta_on defined (ensured by _as_delta method)
+            assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+            result_ds, dependencies, has_changes = delta_retry_update(
                 self,
                 name,
                 on=self._delta_on,
                 right_on=self._delta_result_on,
                 compare=self._delta_compare,
+                delta_retry=self._delta_retry,
             )
 
-            if delta_ds:
+            if result_ds:
                 return self._evolve(
-                    query=delta_ds._query.save(
+                    query=result_ds._query.save(
                         name=name,
                         version=version,
                         feature_schema=schema,
datachain/lib/dc/datasets.py CHANGED
@@ -32,6 +32,7 @@ def read_dataset(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
 ) -> "DataChain":
     """Get data from a saved Dataset. It returns the chain itself.
     If dataset or version is not found locally, it will try to pull it from Studio.
@@ -73,6 +74,11 @@ def read_dataset(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in delta_on will be used.
+        delta_retry: Specifies retry behavior for delta processing. If a string,
+            it's the name of a field in the result dataset that indicates an error
+            when not None - records with errors will be reprocessed. If True,
+            records that exist in the source dataset but not in the result dataset
+            will be reprocessed.
 
     Example:
         ```py
@@ -149,10 +155,15 @@ def read_dataset(
     else:
         signals_schema |= SignalSchema.from_column_types(query.column_types or {})
     chain = DataChain(query, _settings, signals_schema)
+
     if delta:
         chain = chain._as_delta(
-            on=delta_on, right_on=delta_result_on, compare=delta_compare
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
         )
+
     return chain
 
datachain/lib/dc/storage.py CHANGED
@@ -38,6 +38,7 @@ def read_storage(
     delta_on: Optional[Union[str, Sequence[str]]] = None,
     delta_result_on: Optional[Union[str, Sequence[str]]] = None,
     delta_compare: Optional[Union[str, Sequence[str]]] = None,
+    delta_retry: Optional[Union[bool, str]] = None,
     client_config: Optional[dict] = None,
 ) -> "DataChain":
     """Get data from storage(s) as a list of file with all file attributes.
@@ -83,6 +84,13 @@ def read_storage(
         delta_compare: A list of fields used to check if the same row has been modified
             in the new version of the source.
             If not defined, all fields except those defined in `delta_on` will be used.
+        delta_retry: Controls which records to reprocess. Can be:
+            - A string specifying a field name: Records where this field is not None
+              will be reprocessed (error checking mode).
+            - True: Records that exist in the source dataset but not in the result
+              dataset (based on delta_on/delta_result_on fields) will be reprocessed
+              (missing records mode).
+            - False or None: No retry processing.
 
     Returns:
         DataChain: A DataChain object containing the file information.
@@ -208,6 +216,10 @@ def read_storage(
 
     if delta:
         storage_chain = storage_chain._as_delta(
-            on=delta_on, right_on=delta_result_on, compare=delta_compare
+            on=delta_on,
+            right_on=delta_result_on,
+            compare=delta_compare,
+            delta_retry=delta_retry,
        )
+
     return storage_chain
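
Taken together with the delta docstrings above, a minimal sketch (not from the diff; assumes a local "data/" prefix and an "error" output field) of how the new parameter is passed through read_storage:

    # Sketch only: delta_retry as a field name reprocesses rows whose "error"
    # field is set; delta_retry=True reprocesses source rows missing from the result.
    import datachain as dc

    chain = dc.read_storage(
        "data/",
        delta=True,
        delta_on="file.path",
        delta_retry="error",   # or delta_retry=True for missing-records mode
    )
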
datachain/lib/file.py CHANGED
@@ -127,10 +127,7 @@ class TarVFile(VFile):
     @classmethod
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
-        if len(location) > 1:
-            raise VFileError(
-                "multiple 'location's are not supported yet", file.source, file.path
-            )
+        tar_file = cls.parent(file, location)
 
         loc = location[0]
 
@@ -140,15 +137,26 @@ class TarVFile(VFile):
         if (size := loc.get("size", None)) is None:
             raise VFileError("'size' is not specified", file.source, file.path)
 
+        client = file._catalog.get_client(tar_file.source)
+        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
+        return FileSlice(fd, offset, size, file.name)
+
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        if len(location) > 1:
+            raise VFileError(
+                "multiple 'location's are not supported yet", file.source, file.path
+            )
+
+        loc = location[0]
+
         if (parent := loc.get("parent", None)) is None:
             raise VFileError("'parent' is not specified", file.source, file.path)
 
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
 
-        client = file._catalog.get_client(tar_file.source)
-        fd = client.open_object(tar_file, use_cache=file._caching_enabled)
-        return FileSlice(fd, offset, size, file.name)
+        return tar_file
 
 
 class VFileRegistry:
@@ -159,7 +167,7 @@ class VFileRegistry:
         cls._vtype_readers[reader.get_vtype()] = reader
 
     @classmethod
-    def resolve(cls, file: "File", location: list[dict]):
+    def _get_reader(cls, file: "File", location: list[dict]):
         if len(location) == 0:
             raise VFileError(
                 "'location' must not be list of JSONs", file.source, file.path
@@ -174,8 +182,18 @@ class VFileRegistry:
                 "reader not registered", file.source, file.path, vtype=vtype
             )
 
+        return reader
+
+    @classmethod
+    def open(cls, file: "File", location: list[dict]):
+        reader = cls._get_reader(file, location)
         return reader.open(file, location)
 
+    @classmethod
+    def parent(cls, file: "File", location: list[dict]) -> "File":
+        reader = cls._get_reader(file, location)
+        return reader.parent(file, location)
+
 
 class File(DataModel):
     """
@@ -330,7 +348,7 @@ class File(DataModel):
     def open(self, mode: Literal["rb", "r"] = "rb") -> Iterator[Any]:
         """Open the file and return a file object."""
         if self.location:
-            with VFileRegistry.resolve(self, self.location) as f:  # type: ignore[arg-type]
+            with VFileRegistry.open(self, self.location) as f:  # type: ignore[arg-type]
                 yield f
 
         else:
@@ -349,6 +367,13 @@ class File(DataModel):
 
     def read_text(self):
         """Returns file contents as text."""
+        if self.location:
+            raise VFileError(
+                "Reading text from virtual file is not supported",
+                self.source,
+                self.path,
+            )
+
         with self.open(mode="r") as stream:
             return stream.read()
 
@@ -427,9 +452,13 @@ class File(DataModel):
         if self._catalog is None:
             raise RuntimeError("cannot prefetch file because catalog is not setup")
 
+        file = self
+        if self.location:
+            file = VFileRegistry.parent(self, self.location)  # type: ignore[arg-type]
+
         client = self._catalog.get_client(self.source)
-        await client._download(self, callback=download_cb or self._download_cb)
-        self._set_stream(
+        await client._download(file, callback=download_cb or self._download_cb)
+        file._set_stream(
             self._catalog, caching_enabled=True, download_cb=DEFAULT_CALLBACK
         )
         return True
datachain/semver.py CHANGED
@@ -1,8 +1,13 @@
+# Maximum version number for semver (major.minor.patch) is 999999.999999.999999
+# this number was chosen because value("999999.999999.999999") < 2**63 - 1
+MAX_VERSION_NUMBER = 999_999
+
+
 def parse(version: str) -> tuple[int, int, int]:
     """Parsing semver into 3 integers: major, minor, patch"""
     validate(version)
     parts = version.split(".")
-    return (int(parts[0]), int(parts[1]), int(parts[2]))
+    return int(parts[0]), int(parts[1]), int(parts[2])
 
 
 def validate(version: str) -> None:
@@ -20,14 +25,18 @@ def validate(version: str) -> None:
     for part in parts:
         try:
             val = int(part)
-            assert val >= 0
+            assert 0 <= val <= MAX_VERSION_NUMBER
         except (ValueError, AssertionError):
             raise ValueError(error_message) from None
 
 
 def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:
     """Creates new semver from 3 integers: major, minor and patch"""
-    if major < 0 or minor < 0 or patch < 0:
+    if not (
+        0 <= major <= MAX_VERSION_NUMBER
+        and 0 <= minor <= MAX_VERSION_NUMBER
+        and 0 <= patch <= MAX_VERSION_NUMBER
+    ):
         raise ValueError("Major, minor and patch must be greater or equal to zero")
 
     return ".".join([str(major), str(minor), str(patch)])
@@ -35,10 +44,11 @@ def create(major: int = 0, minor: int = 0, patch: int = 0) -> str:
 
 def value(version: str) -> int:
     """
-    Calculate integer value of a version. This is useful when comparing two versions
+    Calculate integer value of a version. This is useful when comparing two versions.
     """
     major, minor, patch = parse(version)
-    return major * 100 + minor * 10 + patch
+    limit = MAX_VERSION_NUMBER + 1
+    return major * (limit**2) + minor * limit + patch
 
 
 def compare(v1: str, v2: str) -> int:
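
As a quick check of the new encoding (a sketch, not from the diff): each component now occupies a fixed base of 1,000,000, so ordering is preserved for all allowed components and the largest possible value still fits in a signed 64-bit integer:

    # Sketch only, illustrating the base-1_000_000 encoding used by semver.value().
    from datachain import semver

    assert semver.value("1.2.3") == 1 * 1_000_000**2 + 2 * 1_000_000 + 3   # 1_000_002_000_003
    assert semver.value("0.10.0") > semver.value("0.9.999")                # ordering preserved
    assert semver.value("999999.999999.999999") < 2**63 - 1                # fits in signed 64-bit
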
datachain-0.18.11.dist-info/METADATA → datachain-0.19.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.18.11
+Version: 0.19.1
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
@@ -146,6 +146,12 @@ Use Cases
    on these tables at scale.
 3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
    Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
+4. **Incremental Processing.** DataChain's delta and retry features allow for efficient
+   processing workflows:
+
+   - **Delta Processing**: Process only new or changed files/records
+   - **Retry Processing**: Automatically reprocess records with errors or missing results
+   - **Combined Approach**: Process new data and fix errors in a single pipeline
 
 Getting Started
 ===============
@@ -158,7 +164,7 @@ to get started with `DataChain` and learn more.
     pip install datachain
 
 
-Example: download subset of files based on metadata
+Example: Download Subset of Files Based on Metadata
 ---------------------------------------------------
 
 Sometimes users only need to download a specific subset of files from cloud storage,
@@ -182,6 +188,54 @@ high confidence scores.
     likely_cats.to_storage("high-confidence-cats/", signal="file")
 
 
+Example: Incremental Processing with Error Handling
+---------------------------------------------------
+
+This example shows how to use both delta and retry processing for efficient handling of large
+datasets that evolve over time and may occasionally have processing errors.
+
+.. code:: py
+
+    import datachain as dc
+    from datachain import C, File
+
+    def process_file(file: File):
+        """Process a file, which may occasionally fail."""
+        try:
+            # Your processing logic here
+            content = file.read_text()
+            result = analyze_content(content)
+            return {
+                "content": content,
+                "result": result,
+                "error": None  # No error
+            }
+        except Exception as e:
+            # Return an error that will trigger reprocessing next time
+            return {
+                "content": None,
+                "result": None,
+                "error": str(e)  # Error field will trigger retry
+            }
+
+    # Process files efficiently with delta and retry
+    chain = (
+        dc.read_storage(
+            "data/",
+            update=True,
+            delta=True,              # Process only new/changed files
+            delta_on="file.path",    # Identify files by path
+            retry_on="error"         # Field that indicates errors
+        )
+        .map(processed_result=process_file)
+        .mutate(
+            content=C("processed_result.content"),
+            result=C("processed_result.result"),
+            error=C("processed_result.error")
+        )
+        .save(name="processed_data")
+    )
+
 Example: LLM based text-file evaluation
 ---------------------------------------
 
datachain-0.18.11.dist-info/RECORD → datachain-0.19.1.dist-info/RECORD CHANGED
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
-datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
+datachain/delta.py,sha256=fP1Yy_MfdnTZmIOe243SBiDWTzd6MqLw0tQxvZNxLcs,8384
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=JtExYIfKMFhEIIcSSWBmaxWpoS3ben7kb692cHHm4Lo,7079
@@ -14,7 +14,7 @@ datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2Sm
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
-datachain/semver.py,sha256=t_3Y5OGLEthrstBwuwrf5pXVquEuRFu3ZoGe3ajfJB8,1715
+datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
 datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
@@ -39,9 +39,9 @@ datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI
 datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
 datachain/client/azure.py,sha256=7yyAgANHfu9Kfh187MKNTT1guvu9Q-WYsi4vYoY3aew,3270
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
-datachain/client/fsspec.py,sha256=c8oRBUMo31k8bMB_mIA60PDfna4nYTdslzHqmqL2Uvg,13918
+datachain/client/fsspec.py,sha256=huPHNDZRGz_rSN7XnS9hKmRoS2fsSz_y2-cxUSlvsOA,13938
 datachain/client/gcs.py,sha256=8hcFhEHp8qGRsJoyfCoawfuwb1Et-MSkyQoM9AnNuXI,5204
-datachain/client/hf.py,sha256=posnI5WOKOMG1yY_ZiV9Orcd24QsUPKZlOXgJVLxxrM,1558
+datachain/client/hf.py,sha256=mRBqHeBT758TJicU-Fn2L3l5AbHWwMzycWwttNUACKk,2180
 datachain/client/local.py,sha256=cGoCYflribzexiOe-Y1qbaE2fJRh-_EgQrfCSa0yK_E,4568
 datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
@@ -68,11 +68,11 @@ datachain/func/random.py,sha256=t7jwXsI8-hy0qAdvjAntgzy-AHtTAfozlZ1CpKR-QZE,458
 datachain/func/string.py,sha256=X9u4ip97U63RCaKRhMddoze7HgPiY3LbPRn9G06UWWo,7311
 datachain/func/window.py,sha256=ImyRpc1QI8QUSPO7KdD60e_DPVo7Ja0G5kcm6BlyMcw,1584
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=mFO_6wRqzpEzBhXf7Xn1aeLUvaiHcC6XQ-8as9sbcgY,10253
+datachain/lib/arrow.py,sha256=2IuNZ6tRFsxVNhWElqr0ptz28geSDzlDHUtzD4qeDNM,10339
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
 datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo,3179
-datachain/lib/file.py,sha256=mzc7_fpHAkVhs4z3jBUhFQzPEbODdXJpzjVfby2IkC4,31117
+datachain/lib/file.py,sha256=PuTa6CEG9CaJXPhxrZFY-R9-DS7ynB9l7Y0bUbd_Qwg,31952
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=erWvZW5M3emnbl6_fGAOPyKm-1EKbt3vOdWPfe3Oo7U,3265
 datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
@@ -99,15 +99,15 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=5rR_QqG4vesq-x545ZTSFJDSb6Oc5CW4-ziQYD6DpW4,80993
-datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
+datachain/lib/dc/datachain.py,sha256=cQjq6_OWQ_1JKvIqb8snl6mKfuBbpllPEao5ygVINog,81733
+datachain/lib/dc/datasets.py,sha256=g_bBGCUwAwNJypYSUQvrDDqnaw7nfXpvrEvUVPtWATY,11268
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
 datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
 datachain/lib/dc/records.py,sha256=J1I69J2gFIBjRTGr2LG-5qn_rTVzRLcr2y3tVDrmHdg,3068
-datachain/lib/dc/storage.py,sha256=YUlw3OtdRmYc2k24AmqjnqJK8k1H-onjh-mCxu_3BbE,8195
+datachain/lib/dc/storage.py,sha256=u-QB_0sn1Wwc0-9phi1zT38UDe5uBIc25xbAhKMU2fA,8774
 datachain/lib/dc/utils.py,sha256=VawOAlJSvAtZbsMg33s5tJe21TRx1Km3QggI1nN6tnw,3984
 datachain/lib/dc/values.py,sha256=7l1n352xWrEdql2NhBcZ3hj8xyPglWiY4qHjFPjn6iw,1428
 datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.18.11.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.18.11.dist-info/METADATA,sha256=TgOokr9DxfY4A1mq7-5APy8DTHUqFEf2FslYxASH1IA,11320
-datachain-0.18.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.18.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.18.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.18.11.dist-info/RECORD,,
+datachain-0.19.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.19.1.dist-info/METADATA,sha256=qg4KSU457ARE-A00yjNYNtFP3vhX0yqsxrCGKctXva4,13281
+datachain-0.19.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.19.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.19.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.19.1.dist-info/RECORD,,