datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +20 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +7 -7
- datachain/catalog/__init__.py +2 -2
- datachain/catalog/catalog.py +621 -507
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +28 -18
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +24 -33
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +83 -52
- datachain/cli/commands/ls.py +17 -17
- datachain/cli/commands/show.py +4 -4
- datachain/cli/parser/__init__.py +8 -74
- datachain/cli/parser/job.py +95 -3
- datachain/cli/parser/studio.py +11 -4
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +4 -4
- datachain/client/fsspec.py +45 -28
- datachain/client/gcs.py +6 -6
- datachain/client/hf.py +29 -2
- datachain/client/http.py +157 -0
- datachain/client/local.py +15 -11
- datachain/client/s3.py +17 -9
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +5 -1
- datachain/data_storage/metastore.py +1252 -186
- datachain/data_storage/schema.py +58 -45
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +286 -127
- datachain/data_storage/warehouse.py +250 -113
- datachain/dataset.py +353 -148
- datachain/delta.py +391 -0
- datachain/diff/__init__.py +27 -29
- datachain/error.py +60 -0
- datachain/func/__init__.py +2 -1
- datachain/func/aggregate.py +66 -42
- datachain/func/array.py +242 -38
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +110 -60
- datachain/func/func.py +96 -45
- datachain/func/numeric.py +55 -38
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +67 -37
- datachain/func/window.py +7 -8
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +58 -22
- datachain/lib/audio.py +245 -0
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/flatten.py +5 -3
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/sql_to_python.py +8 -0
- datachain/lib/convert/values_to_tuples.py +156 -51
- datachain/lib/data_model.py +42 -20
- datachain/lib/dataset_info.py +36 -8
- datachain/lib/dc/__init__.py +8 -2
- datachain/lib/dc/csv.py +25 -28
- datachain/lib/dc/database.py +398 -0
- datachain/lib/dc/datachain.py +1289 -425
- datachain/lib/dc/datasets.py +320 -38
- datachain/lib/dc/hf.py +38 -24
- datachain/lib/dc/json.py +29 -32
- datachain/lib/dc/listings.py +112 -8
- datachain/lib/dc/pandas.py +16 -12
- datachain/lib/dc/parquet.py +35 -23
- datachain/lib/dc/records.py +31 -23
- datachain/lib/dc/storage.py +154 -64
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +24 -16
- datachain/lib/dc/values.py +8 -9
- datachain/lib/file.py +622 -89
- datachain/lib/hf.py +69 -39
- datachain/lib/image.py +14 -14
- datachain/lib/listing.py +14 -11
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +3 -4
- datachain/lib/model_store.py +39 -7
- datachain/lib/namespaces.py +125 -0
- datachain/lib/projects.py +130 -0
- datachain/lib/pytorch.py +32 -21
- datachain/lib/settings.py +192 -56
- datachain/lib/signal_schema.py +427 -104
- datachain/lib/tar.py +1 -2
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +164 -76
- datachain/lib/udf_signature.py +60 -35
- datachain/lib/utils.py +118 -4
- datachain/lib/video.py +17 -9
- datachain/lib/webdataset.py +61 -56
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +22 -10
- datachain/model/bbox.py +3 -1
- datachain/model/ultralytics/bbox.py +16 -12
- datachain/model/ultralytics/pose.py +16 -12
- datachain/model/ultralytics/segment.py +16 -12
- datachain/namespace.py +84 -0
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +78 -0
- datachain/query/batch.py +40 -41
- datachain/query/dataset.py +604 -322
- datachain/query/dispatch.py +261 -154
- datachain/query/metrics.py +4 -6
- datachain/query/params.py +2 -3
- datachain/query/queue.py +3 -12
- datachain/query/schema.py +11 -6
- datachain/query/session.py +200 -33
- datachain/query/udf.py +34 -2
- datachain/remote/studio.py +171 -69
- datachain/script_meta.py +12 -12
- datachain/semver.py +68 -0
- datachain/sql/__init__.py +2 -0
- datachain/sql/functions/array.py +33 -1
- datachain/sql/postgresql_dialect.py +9 -0
- datachain/sql/postgresql_types.py +21 -0
- datachain/sql/sqlite/__init__.py +5 -1
- datachain/sql/sqlite/base.py +102 -29
- datachain/sql/sqlite/types.py +8 -13
- datachain/sql/types.py +70 -15
- datachain/studio.py +223 -46
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +101 -59
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
- datachain-0.39.0.dist-info/RECORD +173 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
- datachain/cli/commands/query.py +0 -53
- datachain/query/utils.py +0 -42
- datachain-0.14.2.dist-info/RECORD +0 -158
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/delta.py
ADDED
@@ -0,0 +1,391 @@
+from collections.abc import Sequence
+from copy import copy
+from functools import wraps
+from typing import TYPE_CHECKING, TypeVar
+
+import datachain
+from datachain.dataset import DatasetDependency, DatasetRecord
+from datachain.error import DatasetNotFoundError, SchemaDriftError
+from datachain.project import Project
+from datachain.query.dataset import UnionSchemaMismatchError
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
+
+    from datachain.lib.dc import DataChain
+    from datachain.lib.signal_schema import SignalSchema
+
+    P = ParamSpec("P")
+
+
+T = TypeVar("T", bound="DataChain")
+
+
+def delta_disabled(
+    method: "Callable[Concatenate[T, P], T]",
+) -> "Callable[Concatenate[T, P], T]":
+    """
+    Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to
+    work with delta updates. It throws `NotImplementedError` if chain on which
+    method is called is marked as delta.
+    """
+
+    @wraps(method)
+    def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
+        if self.delta and not self._delta_unsafe:
+            raise NotImplementedError(
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
+            )
+        return method(self, *args, **kwargs)
+
+    return _inner
+
+
+def _append_steps(dc: "DataChain", other: "DataChain"):
+    """Returns cloned chain with appended steps from other chain.
+    Steps are all those modification methods applied like filters, mappers etc.
+    """
+    dc = dc.clone()
+    dc._query.steps += other._query.steps.copy()
+    dc.signals_schema = other.signals_schema
+    return dc
+
+
+def _format_schema_drift_message(
+    context: str,
+    existing_schema: "SignalSchema",
+    updated_schema: "SignalSchema",
+) -> tuple[str, bool]:
+    missing_cols, new_cols = existing_schema.compare_signals(updated_schema)
+
+    if not new_cols and not missing_cols:
+        return "", False
+
+    parts: list[str] = []
+    if new_cols:
+        parts.append("new columns detected: " + ", ".join(sorted(new_cols)))
+    if missing_cols:
+        parts.append(
+            "columns missing in updated data: " + ", ".join(sorted(missing_cols))
+        )
+
+    details = "; ".join(parts)
+    message = f"Delta update failed: schema drift detected while {context}: {details}."
+
+    return message, True
+
+
+def _safe_union(
+    left: "DataChain",
+    right: "DataChain",
+    context: str,
+) -> "DataChain":
+    try:
+        return left.union(right)
+    except UnionSchemaMismatchError as exc:
+        message, has_drift = _format_schema_drift_message(
+            context,
+            left.signals_schema,
+            right.signals_schema,
+        )
+        if has_drift:
+            raise SchemaDriftError(message) from exc
+        raise
+
+
+def _get_delta_chain(
+    source_ds_name: str,
+    source_ds_project: Project,
+    source_ds_version: str,
+    source_ds_latest_version: str,
+    on: str | Sequence[str],
+    compare: str | Sequence[str] | None = None,
+) -> "DataChain":
+    """Get delta chain for processing changes between versions."""
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
+    )
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_latest_version,
+    )
+
+    # Calculate diff between source versions
+    return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)
+
+
+def _get_retry_chain(
+    name: str,
+    namespace_name: str,
+    project_name: str,
+    latest_version: str,
+    source_ds_name: str,
+    source_ds_project: Project,
+    source_ds_version: str,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None,
+    delta_retry: bool | str | None,
+    diff_chain: "DataChain",
+) -> "DataChain | None":
+    """Get retry chain for processing error records and missing records."""
+    # Import here to avoid circular import
+    from datachain.lib.dc import C
+
+    retry_chain = None
+
+    # Read the latest version of the result dataset for retry logic
+    result_dataset = datachain.read_dataset(
+        name,
+        namespace=namespace_name,
+        project=project_name,
+        version=latest_version,
+    )
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
+    )
+
+    # Handle error records if delta_retry is a string (column name)
+    if isinstance(delta_retry, str):
+        error_records = result_dataset.filter(C(delta_retry) != "")
+        error_source_records = source_dc.merge(
+            error_records, on=on, right_on=right_on, inner=True
+        ).select(
+            *list(source_dc.signals_schema.clone_without_sys_signals().values.keys())
+        )
+        retry_chain = error_source_records
+
+    # Handle missing records if delta_retry is True
+    elif delta_retry is True:
+        missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
+        retry_chain = missing_records
+
+    # Subtract also diff chain since some items might be picked
+    # up by `delta=True` itself (e.g. records got modified AND are missing in the
+    # result dataset atm)
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )
+
+
+def _get_source_info(
+    source_ds: DatasetRecord,
+    name: str,
+    namespace_name: str,
+    project_name: str,
+    latest_version: str,
+    catalog,
+) -> tuple[
+    str | None,
+    Project | None,
+    str | None,
+    str | None,
+    list[DatasetDependency] | None,
+]:
+    """Get source dataset information and dependencies.
+
+    Returns:
+        Tuple of (source_name, source_version, source_latest_version, dependencies)
+        Returns (None, None, None, None) if source dataset was removed.
+    """
+    dependencies = catalog.get_dataset_dependencies(
+        name,
+        latest_version,
+        namespace_name=namespace_name,
+        project_name=project_name,
+        indirect=False,
+    )
+
+    source_ds_dep = next(
+        (d for d in dependencies if d and d.name == source_ds.name), None
+    )
+    if not source_ds_dep:
+        # Starting dataset was removed, back off to normal dataset creation
+        return None, None, None, None, None
+
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )
+
+    return (
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
+        dependencies,
+    )
+
+
+def delta_retry_update(
+    dc: "DataChain",
+    namespace_name: str,
+    project_name: str,
+    name: str,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+) -> tuple["DataChain | None", list[DatasetDependency] | None, bool]:
+    """
+    Creates new chain that consists of the last version of current delta dataset
+    plus diff from the source with all needed modifications.
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
+    but just the diff part which is very important for performance.
+
+    Note that currently delta update works only if there is only one direct
+    dependency.
+
+    Additionally supports retry functionality to filter records that either:
+    1. Have a non-None value in the field specified by delta_retry (when it's a string)
+    2. Exist in the source dataset but are missing in the result dataset
+       (when delta_retry=True)
+
+    Parameters:
+        dc: The DataChain to filter for records that need reprocessing
+        name: Name of the destination dataset
+        on: Field(s) in source dataset that uniquely identify records
+        right_on: Corresponding field(s) in result dataset if they differ from
+            source
+        compare: Field(s) used to check if the same row has been modified
+        delta_retry: If string, field in result dataset that indicates an error
+            when not None. If True, include records missing from result dataset.
+            If False/None, no retry functionality.
+
+    Returns:
+        A tuple containing (filtered chain for delta/retry processing,
+        dependencies, found records flag)
+    """
+
+    catalog = dc.session.catalog
+    # project = catalog.metastore.get_project(project_name, namespace_name)
+    dc._query.apply_listing_pre_step()
+
+    # Check if dataset exists
+    try:
+        dataset = catalog.get_dataset(
+            name, namespace_name=namespace_name, project_name=project_name
+        )
+        latest_version = dataset.latest_version
+    except DatasetNotFoundError:
+        # First creation of result dataset
+        return None, None, True
+
+    # Initialize variables
+    diff_chain = None
+    dependencies = None
+    retry_chain = None
+    processing_chain = None
+
+    (
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        dependencies,
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )
+
+    # If source_ds_name is None, starting dataset was removed
+    if source_ds_name is None:
+        return None, None, True
+
+    assert source_ds_project
+    assert source_ds_version
+    assert source_ds_latest_version
+
+    diff_chain = _get_delta_chain(
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        on,
+        compare,
+    )
+
+    # Filter out removed dep
+    if dependencies:
+        dependencies = copy(dependencies)
+        dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
+        # Update to latest version
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]
+
+    # Handle retry functionality if enabled
+    if delta_retry:
+        retry_chain = _get_retry_chain(
+            name,
+            namespace_name,
+            project_name,
+            latest_version,
+            source_ds_name,
+            source_ds_project,
+            source_ds_version,
+            on,
+            right_on,
+            delta_retry,
+            diff_chain,
+        )
+
+    # Combine delta and retry chains
+    if retry_chain is not None:
+        processing_chain = _safe_union(
+            diff_chain,
+            retry_chain,
+            context="combining retry records with delta changes",
+        )
+    else:
+        processing_chain = diff_chain
+
+    # Apply all the steps from the original chain to processing_chain
+    processing_chain = _append_steps(processing_chain, dc).persist()
+
+    # Check if chain becomes empty after applying steps
+    if processing_chain is None or (processing_chain and processing_chain.empty):
+        return None, None, False
+
+    latest_dataset = datachain.read_dataset(
+        name,
+        namespace=namespace_name,
+        project=project_name,
+        version=latest_version,
+    )
+    compared_chain = latest_dataset.diff(
+        processing_chain,
+        on=right_on or on,
+        added=True,
+        modified=False,
+        deleted=False,
+    )
+    result_chain = _safe_union(
+        compared_chain,
+        processing_chain,
+        context="merging the delta output with the existing dataset version",
+    )
+    return result_chain, dependencies, True
datachain/diff/__init__.py
CHANGED
@@ -1,8 +1,6 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING
 
 from datachain.func import case, ifelse, isnone, or_
 from datachain.lib.signal_schema import SignalSchema
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain
 
-
 C = Column
 
 
-def get_status_col_name() -> str:
-    """Returns new unique status col name"""
-    return "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"
 
 
 class CompareStatus(str, Enum):
@@ -33,22 +27,22 @@ class CompareStatus(str, Enum):
 def _compare(  # noqa: C901
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
     same: bool = True,
-    status_col: Optional[str] = None,
+    status_col: str | None = None,
 ) -> "DataChain":
     """Comparing two chains by identifying rows that are added, deleted, modified
     or same"""
     rname = "right_"
     schema = left.signals_schema  # final chain must have schema from left chain
 
-    def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]:
+    def _to_list(obj: str | Sequence[str] | None) -> list[str] | None:
         if obj is None:
             return None
         return [obj] if isinstance(obj, str) else list(obj)
@@ -77,14 +71,16 @@ def _compare(  # noqa: C901
     cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())
 
     # getting correct on and right_on column names
+    on_ = on
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-    right_on = right.signals_schema.resolve(*(right_on or on)).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals()  # type: ignore[assignment]
 
     # getting correct compare and right_compare column names if they are defined
     if compare:
+        compare_ = compare
         compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
         right_compare = right.signals_schema.resolve(
-            *(right_compare or compare)
+            *(right_compare or compare_)
         ).db_signals()  # type: ignore[assignment]
     elif not compare and len(cols) != len(right_cols):
         # here we will mark all rows that are not added or deleted as modified since
@@ -99,21 +95,23 @@ def _compare(  # noqa: C901
         compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]
 
     # get diff column names
-    diff_col = status_col or get_status_col_name()
-    ldiff_col = get_status_col_name()
-    rdiff_col = get_status_col_name()
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME
 
     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
     right = right.mutate(**{rdiff_col: 1})
 
-    if not compare:
+    if compare is None:
         modified_cond = True
+    elif len(compare) == 0:
+        modified_cond = False
     else:
         modified_cond = or_(  # type: ignore[assignment]
             *[
                 C(c) != (C(f"{rname}{rc}") if c == rc else C(rc))
-                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+                for c, rc in zip(compare, right_compare, strict=False)  # type: ignore[arg-type]
             ]
         )
 
@@ -137,7 +135,7 @@ def _compare(  # noqa: C901
                     C(f"{rname + l_on if on == right_on else r_on}"),
                     C(l_on),
                 )
-                for l_on, r_on in zip(on, right_on)  # type: ignore[arg-type]
+                for l_on, r_on in zip(on, right_on, strict=False)  # type: ignore[arg-type]
             }
         )
         .select_except(ldiff_col, rdiff_col)
@@ -168,10 +166,10 @@ def _compare(  # noqa: C901
 def compare_and_split(
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
@@ -221,7 +219,7 @@ def compare_and_split(
     )
     ```
     """
-    status_col = get_status_col_name()
+    status_col = STATUS_COL_NAME
 
     res = _compare(
         left,
datachain/error.py
CHANGED
@@ -2,10 +2,54 @@ class DataChainError(RuntimeError):
     pass
 
 
+class SchemaDriftError(DataChainError):
+    pass
+
+
+class InvalidDatasetNameError(RuntimeError):
+    pass
+
+
+class InvalidNamespaceNameError(RuntimeError):
+    pass
+
+
+class InvalidProjectNameError(RuntimeError):
+    pass
+
+
 class NotFoundError(Exception):
     pass
 
 
+class NamespaceNotFoundError(NotFoundError):
+    pass
+
+
+class NotAllowedError(Exception):
+    pass
+
+
+class NamespaceCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
+class NamespaceDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetNotFoundError(NotFoundError):
     pass
 
@@ -53,3 +97,19 @@ class ClientError(RuntimeError):
 
 class TableMissingError(DataChainError):
     pass
+
+
+class OutdatedDatabaseSchemaError(DataChainError):
+    pass
+
+
+class CheckpointNotFoundError(NotFoundError):
+    pass
+
+
+class JobNotFoundError(NotFoundError):
+    pass
+
+
+class JobAncestryDepthExceededError(DataChainError):
+    pass
datachain/func/__init__.py
CHANGED
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import and_, case, greatest, ifelse, isnone, least, or_
+from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .path import file_ext, file_stem, name, parent
 from .random import rand
@@ -54,6 +54,7 @@ __all__ = [
     "max",
     "min",
     "name",
+    "not_",
     "or_",
     "parent",
     "path",