datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/delta.py ADDED
@@ -0,0 +1,391 @@
+from collections.abc import Sequence
+from copy import copy
+from functools import wraps
+from typing import TYPE_CHECKING, TypeVar
+
+import datachain
+from datachain.dataset import DatasetDependency, DatasetRecord
+from datachain.error import DatasetNotFoundError, SchemaDriftError
+from datachain.project import Project
+from datachain.query.dataset import UnionSchemaMismatchError
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import Concatenate
+
+    from typing_extensions import ParamSpec
+
+    from datachain.lib.dc import DataChain
+    from datachain.lib.signal_schema import SignalSchema
+
+    P = ParamSpec("P")
+
+
+T = TypeVar("T", bound="DataChain")
+
+
+def delta_disabled(
+    method: "Callable[Concatenate[T, P], T]",
+) -> "Callable[Concatenate[T, P], T]":
+    """
+    Decorator for disabling DataChain methods (e.g `.agg()` or `.union()`) to
+    work with delta updates. It throws `NotImplementedError` if chain on which
+    method is called is marked as delta.
+    """
+
+    @wraps(method)
+    def _inner(self: T, *args: "P.args", **kwargs: "P.kwargs") -> T:
+        if self.delta and not self._delta_unsafe:
+            raise NotImplementedError(
+                f"Cannot use {method.__name__} with delta datasets - may cause"
+                " inconsistency. Use delta_unsafe flag to allow this operation."
+            )
+        return method(self, *args, **kwargs)
+
+    return _inner
+
+
+def _append_steps(dc: "DataChain", other: "DataChain"):
+    """Returns cloned chain with appended steps from other chain.
+    Steps are all those modification methods applied like filters, mappers etc.
+    """
+    dc = dc.clone()
+    dc._query.steps += other._query.steps.copy()
+    dc.signals_schema = other.signals_schema
+    return dc
+
+
+def _format_schema_drift_message(
+    context: str,
+    existing_schema: "SignalSchema",
+    updated_schema: "SignalSchema",
+) -> tuple[str, bool]:
+    missing_cols, new_cols = existing_schema.compare_signals(updated_schema)
+
+    if not new_cols and not missing_cols:
+        return "", False
+
+    parts: list[str] = []
+    if new_cols:
+        parts.append("new columns detected: " + ", ".join(sorted(new_cols)))
+    if missing_cols:
+        parts.append(
+            "columns missing in updated data: " + ", ".join(sorted(missing_cols))
+        )
+
+    details = "; ".join(parts)
+    message = f"Delta update failed: schema drift detected while {context}: {details}."
+
+    return message, True
+
+
+def _safe_union(
+    left: "DataChain",
+    right: "DataChain",
+    context: str,
+) -> "DataChain":
+    try:
+        return left.union(right)
+    except UnionSchemaMismatchError as exc:
+        message, has_drift = _format_schema_drift_message(
+            context,
+            left.signals_schema,
+            right.signals_schema,
+        )
+        if has_drift:
+            raise SchemaDriftError(message) from exc
+        raise
+
+
+def _get_delta_chain(
+    source_ds_name: str,
+    source_ds_project: Project,
+    source_ds_version: str,
+    source_ds_latest_version: str,
+    on: str | Sequence[str],
+    compare: str | Sequence[str] | None = None,
+) -> "DataChain":
+    """Get delta chain for processing changes between versions."""
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
+    )
+    source_dc_latest = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_latest_version,
+    )
+
+    # Calculate diff between source versions
+    return source_dc_latest.diff(source_dc, on=on, compare=compare, deleted=False)
+
+
+def _get_retry_chain(
+    name: str,
+    namespace_name: str,
+    project_name: str,
+    latest_version: str,
+    source_ds_name: str,
+    source_ds_project: Project,
+    source_ds_version: str,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None,
+    delta_retry: bool | str | None,
+    diff_chain: "DataChain",
+) -> "DataChain | None":
+    """Get retry chain for processing error records and missing records."""
+    # Import here to avoid circular import
+    from datachain.lib.dc import C
+
+    retry_chain = None
+
+    # Read the latest version of the result dataset for retry logic
+    result_dataset = datachain.read_dataset(
+        name,
+        namespace=namespace_name,
+        project=project_name,
+        version=latest_version,
+    )
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
+    )
+
+    # Handle error records if delta_retry is a string (column name)
+    if isinstance(delta_retry, str):
+        error_records = result_dataset.filter(C(delta_retry) != "")
+        error_source_records = source_dc.merge(
+            error_records, on=on, right_on=right_on, inner=True
+        ).select(
+            *list(source_dc.signals_schema.clone_without_sys_signals().values.keys())
+        )
+        retry_chain = error_source_records
+
+    # Handle missing records if delta_retry is True
+    elif delta_retry is True:
+        missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
+        retry_chain = missing_records
+
+    # Subtract also diff chain since some items might be picked
+    # up by `delta=True` itself (e.g. records got modified AND are missing in the
+    # result dataset atm)
+    on = [on] if isinstance(on, str) else on
+
+    return (
+        retry_chain.diff(
+            diff_chain, on=on, added=True, same=True, modified=False, deleted=False
+        ).distinct(*on)
+        if retry_chain
+        else None
+    )
+
+
+def _get_source_info(
+    source_ds: DatasetRecord,
+    name: str,
+    namespace_name: str,
+    project_name: str,
+    latest_version: str,
+    catalog,
+) -> tuple[
+    str | None,
+    Project | None,
+    str | None,
+    str | None,
+    list[DatasetDependency] | None,
+]:
+    """Get source dataset information and dependencies.
+
+    Returns:
+        Tuple of (source_name, source_version, source_latest_version, dependencies)
+        Returns (None, None, None, None) if source dataset was removed.
+    """
+    dependencies = catalog.get_dataset_dependencies(
+        name,
+        latest_version,
+        namespace_name=namespace_name,
+        project_name=project_name,
+        indirect=False,
+    )
+
+    source_ds_dep = next(
+        (d for d in dependencies if d and d.name == source_ds.name), None
+    )
+    if not source_ds_dep:
+        # Starting dataset was removed, back off to normal dataset creation
+        return None, None, None, None, None
+
+    # Refresh starting dataset to have new versions if they are created
+    source_ds = catalog.get_dataset(
+        source_ds.name,
+        namespace_name=source_ds.project.namespace.name,
+        project_name=source_ds.project.name,
+    )
+
+    return (
+        source_ds.name,
+        source_ds.project,
+        source_ds_dep.version,
+        source_ds.latest_version,
+        dependencies,
+    )
+
+
+def delta_retry_update(
+    dc: "DataChain",
+    namespace_name: str,
+    project_name: str,
+    name: str,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    delta_retry: bool | str | None = None,
+) -> tuple["DataChain | None", list[DatasetDependency] | None, bool]:
+    """
+    Creates new chain that consists of the last version of current delta dataset
+    plus diff from the source with all needed modifications.
+    This way we don't need to re-calculate the whole chain from the source again
+    (apply all the DataChain methods like filters, mappers, generators etc.)
+    but just the diff part which is very important for performance.
+
+    Note that currently delta update works only if there is only one direct
+    dependency.
+
+    Additionally supports retry functionality to filter records that either:
+    1. Have a non-None value in the field specified by delta_retry (when it's a string)
+    2. Exist in the source dataset but are missing in the result dataset
+       (when delta_retry=True)
+
+    Parameters:
+        dc: The DataChain to filter for records that need reprocessing
+        name: Name of the destination dataset
+        on: Field(s) in source dataset that uniquely identify records
+        right_on: Corresponding field(s) in result dataset if they differ from
+            source
+        compare: Field(s) used to check if the same row has been modified
+        delta_retry: If string, field in result dataset that indicates an error
+            when not None. If True, include records missing from result dataset.
+            If False/None, no retry functionality.
+
+    Returns:
+        A tuple containing (filtered chain for delta/retry processing,
+        dependencies, found records flag)
+    """
+
+    catalog = dc.session.catalog
+    # project = catalog.metastore.get_project(project_name, namespace_name)
+    dc._query.apply_listing_pre_step()
+
+    # Check if dataset exists
+    try:
+        dataset = catalog.get_dataset(
+            name, namespace_name=namespace_name, project_name=project_name
+        )
+        latest_version = dataset.latest_version
+    except DatasetNotFoundError:
+        # First creation of result dataset
+        return None, None, True
+
+    # Initialize variables
+    diff_chain = None
+    dependencies = None
+    retry_chain = None
+    processing_chain = None
+
+    (
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        dependencies,
+    ) = _get_source_info(
+        dc._query.starting_step.dataset,  # type: ignore[union-attr]
+        name,
+        namespace_name,
+        project_name,
+        latest_version,
+        catalog,
+    )
+
+    # If source_ds_name is None, starting dataset was removed
+    if source_ds_name is None:
+        return None, None, True
+
+    assert source_ds_project
+    assert source_ds_version
+    assert source_ds_latest_version
+
+    diff_chain = _get_delta_chain(
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        on,
+        compare,
+    )
+
+    # Filter out removed dep
+    if dependencies:
+        dependencies = copy(dependencies)
+        dependencies = [d for d in dependencies if d is not None]
+        source_ds_dep = next(d for d in dependencies if d.name == source_ds_name)
+        # Update to latest version
+        source_ds_dep.version = source_ds_latest_version  # type: ignore[union-attr]
+
+    # Handle retry functionality if enabled
+    if delta_retry:
+        retry_chain = _get_retry_chain(
+            name,
+            namespace_name,
+            project_name,
+            latest_version,
+            source_ds_name,
+            source_ds_project,
+            source_ds_version,
+            on,
+            right_on,
+            delta_retry,
+            diff_chain,
+        )
+
+    # Combine delta and retry chains
+    if retry_chain is not None:
+        processing_chain = _safe_union(
+            diff_chain,
+            retry_chain,
+            context="combining retry records with delta changes",
+        )
+    else:
+        processing_chain = diff_chain
+
+    # Apply all the steps from the original chain to processing_chain
+    processing_chain = _append_steps(processing_chain, dc).persist()
+
+    # Check if chain becomes empty after applying steps
+    if processing_chain is None or (processing_chain and processing_chain.empty):
+        return None, None, False
+
+    latest_dataset = datachain.read_dataset(
+        name,
+        namespace=namespace_name,
+        project=project_name,
+        version=latest_version,
+    )
+    compared_chain = latest_dataset.diff(
+        processing_chain,
+        on=right_on or on,
+        added=True,
+        modified=False,
+        deleted=False,
+    )
+    result_chain = _safe_union(
+        compared_chain,
+        processing_chain,
+        context="merging the delta output with the existing dataset version",
+    )
+    return result_chain, dependencies, True
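
The guard that `delta_disabled` adds is easiest to see in isolation. Below is a minimal, self-contained sketch of the same pattern: the decorator body is copied from the diff above, while the `Chain` class, its `delta`/`_delta_unsafe` flags, and the `union` method are toy stand-ins rather than the real DataChain API.

```python
from functools import wraps


def delta_disabled(method):
    """Raise if the decorated method is called on a chain marked as delta."""

    @wraps(method)
    def _inner(self, *args, **kwargs):
        if self.delta and not self._delta_unsafe:
            raise NotImplementedError(
                f"Cannot use {method.__name__} with delta datasets - may cause"
                " inconsistency. Use delta_unsafe flag to allow this operation."
            )
        return method(self, *args, **kwargs)

    return _inner


class Chain:
    """Toy stand-in for DataChain, only to exercise the decorator."""

    def __init__(self, delta=False, delta_unsafe=False):
        self.delta = delta
        self._delta_unsafe = delta_unsafe

    @delta_disabled
    def union(self, other):
        return self  # a real implementation would combine the two chains


Chain().union(Chain())  # fine: not a delta chain
try:
    Chain(delta=True).union(Chain())  # raises NotImplementedError
except NotImplementedError as exc:
    print(exc)
Chain(delta=True, delta_unsafe=True).union(Chain())  # allowed via the unsafe flag
```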
datachain/diff/__init__.py CHANGED
@@ -1,8 +1,6 @@
-import random
-import string
 from collections.abc import Sequence
 from enum import Enum
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING

 from datachain.func import case, ifelse, isnone, or_
 from datachain.lib.signal_schema import SignalSchema
@@ -11,16 +9,12 @@ from datachain.query.schema import Column
 if TYPE_CHECKING:
     from datachain.lib.dc import DataChain

-
 C = Column


-def get_status_col_name() -> str:
-    """Returns new unique status col name"""
-    return "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+STATUS_COL_NAME = "diff_7aeed3aa17ba4d50b8d1c368c76e16a6"
+LEFT_DIFF_COL_NAME = "diff_95f95344064a4b819c8625cd1a5cfc2b"
+RIGHT_DIFF_COL_NAME = "diff_5808838a49b54849aa461d7387376d34"


 class CompareStatus(str, Enum):
@@ -33,22 +27,22 @@ class CompareStatus(str, Enum):
 def _compare(  # noqa: C901
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
     same: bool = True,
-    status_col: Optional[str] = None,
+    status_col: str | None = None,
 ) -> "DataChain":
     """Comparing two chains by identifying rows that are added, deleted, modified
     or same"""
     rname = "right_"
     schema = left.signals_schema  # final chain must have schema from left chain

-    def _to_list(obj: Optional[Union[str, Sequence[str]]]) -> Optional[list[str]]:
+    def _to_list(obj: str | Sequence[str] | None) -> list[str] | None:
         if obj is None:
             return None
         return [obj] if isinstance(obj, str) else list(obj)
@@ -77,14 +71,16 @@ def _compare( # noqa: C901
     cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())

     # getting correct on and right_on column names
+    on_ = on
     on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
-    right_on = right.signals_schema.resolve(*(right_on or on)).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*(right_on or on_)).db_signals()  # type: ignore[assignment]

     # getting correct compare and right_compare column names if they are defined
     if compare:
+        compare_ = compare
         compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
         right_compare = right.signals_schema.resolve(
-            *(right_compare or compare)
+            *(right_compare or compare_)
         ).db_signals()  # type: ignore[assignment]
     elif not compare and len(cols) != len(right_cols):
         # here we will mark all rows that are not added or deleted as modified since
@@ -99,21 +95,23 @@ def _compare( # noqa: C901
         compare = right_compare = [c for c in cols if c in right_cols and c not in on]  # type: ignore[misc]

     # get diff column names
-    diff_col = status_col or get_status_col_name()
-    ldiff_col = get_status_col_name()
-    rdiff_col = get_status_col_name()
+    diff_col = status_col or STATUS_COL_NAME
+    ldiff_col = LEFT_DIFF_COL_NAME
+    rdiff_col = RIGHT_DIFF_COL_NAME

     # adding helper diff columns, which will be removed after
     left = left.mutate(**{ldiff_col: 1})
     right = right.mutate(**{rdiff_col: 1})

-    if not compare:
+    if compare is None:
         modified_cond = True
+    elif len(compare) == 0:
+        modified_cond = False
     else:
         modified_cond = or_(  # type: ignore[assignment]
             *[
                 C(c) != (C(f"{rname}{rc}") if c == rc else C(rc))
-                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+                for c, rc in zip(compare, right_compare, strict=False)  # type: ignore[arg-type]
             ]
         )

@@ -137,7 +135,7 @@ def _compare( # noqa: C901
                     C(f"{rname + l_on if on == right_on else r_on}"),
                     C(l_on),
                 )
-                for l_on, r_on in zip(on, right_on)  # type: ignore[arg-type]
+                for l_on, r_on in zip(on, right_on, strict=False)  # type: ignore[arg-type]
             }
         )
         .select_except(ldiff_col, rdiff_col)
@@ -168,10 +166,10 @@
 def compare_and_split(
     left: "DataChain",
     right: "DataChain",
-    on: Union[str, Sequence[str]],
-    right_on: Optional[Union[str, Sequence[str]]] = None,
-    compare: Optional[Union[str, Sequence[str]]] = None,
-    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    on: str | Sequence[str],
+    right_on: str | Sequence[str] | None = None,
+    compare: str | Sequence[str] | None = None,
+    right_compare: str | Sequence[str] | None = None,
     added: bool = True,
     deleted: bool = True,
     modified: bool = True,
@@ -221,7 +219,7 @@ def compare_and_split(
         )
         ```
     """
-    status_col = get_status_col_name()
+    status_col = STATUS_COL_NAME

     res = _compare(
         left,
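
The `modified_cond` change above splits into three cases what the old `if not compare:` collapsed into one. A small sketch of that three-way logic with plain Python values; the `modified_cond` helper and sample rows here are illustrative only, while the real code builds an `or_` expression over column comparisons:

```python
def modified_cond(compare, left_row, right_row):
    # compare is None: no compare columns could be resolved, so any matched
    # row is treated as potentially modified
    if compare is None:
        return True
    # compare == []: nothing to compare on, so no row counts as modified
    if len(compare) == 0:
        return False
    # otherwise a row is modified when any compare column differs
    # (the real code builds or_(C(c) != C(right_c), ...) over these columns)
    return any(left_row[c] != right_row[c] for c in compare)


left = {"id": 1, "name": "cat.jpg", "size": 100}
right = {"id": 1, "name": "cat.jpg", "size": 200}

print(modified_cond(None, left, right))      # True  - no compare columns known
print(modified_cond([], left, right))        # False - empty compare list
print(modified_cond(["name"], left, right))  # False - name is unchanged
print(modified_cond(["size"], left, right))  # True  - size differs
```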
datachain/error.py CHANGED
@@ -2,10 +2,54 @@ class DataChainError(RuntimeError):
     pass


+class SchemaDriftError(DataChainError):
+    pass
+
+
+class InvalidDatasetNameError(RuntimeError):
+    pass
+
+
+class InvalidNamespaceNameError(RuntimeError):
+    pass
+
+
+class InvalidProjectNameError(RuntimeError):
+    pass
+
+
 class NotFoundError(Exception):
     pass


+class NamespaceNotFoundError(NotFoundError):
+    pass
+
+
+class NotAllowedError(Exception):
+    pass
+
+
+class NamespaceCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectCreateNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
+class NamespaceDeleteNotAllowedError(NotAllowedError):
+    pass
+
+
+class ProjectNotFoundError(NotFoundError):
+    pass
+
+
 class DatasetNotFoundError(NotFoundError):
     pass

@@ -53,3 +97,19 @@ class ClientError(RuntimeError):

 class TableMissingError(DataChainError):
     pass
+
+
+class OutdatedDatabaseSchemaError(DataChainError):
+    pass
+
+
+class CheckpointNotFoundError(NotFoundError):
+    pass
+
+
+class JobNotFoundError(NotFoundError):
+    pass
+
+
+class JobAncestryDepthExceededError(DataChainError):
+    pass
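
All of the added exceptions derive from the existing bases (`DataChainError`, `NotFoundError`, or the new `NotAllowedError`), so existing `except NotFoundError` handlers keep catching the new lookup errors. A minimal sketch, assuming the classes are imported from `datachain.error` exactly as defined in the diff:

```python
from datachain.error import (
    DatasetNotFoundError,
    NamespaceNotFoundError,
    NotFoundError,
    ProjectNotFoundError,
)


def describe(exc: Exception) -> str:
    # specific lookups can be told apart...
    if isinstance(exc, NamespaceNotFoundError):
        return "namespace missing"
    if isinstance(exc, ProjectNotFoundError):
        return "project missing"
    if isinstance(exc, DatasetNotFoundError):
        return "dataset missing"
    # ...while a single NotFoundError check still covers all of them
    if isinstance(exc, NotFoundError):
        return "something was not found"
    return "unexpected error"


print(describe(NamespaceNotFoundError("namespace 'dev' not found")))  # namespace missing
print(describe(DatasetNotFoundError("dataset 'images' not found")))   # dataset missing
```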
datachain/func/__init__.py CHANGED
@@ -16,7 +16,7 @@ from .aggregate import (
     sum,
 )
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
-from .conditional import and_, case, greatest, ifelse, isnone, least, or_
+from .conditional import and_, case, greatest, ifelse, isnone, least, not_, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
 from .path import file_ext, file_stem, name, parent
 from .random import rand
@@ -54,6 +54,7 @@ __all__ = [
     "max",
     "min",
     "name",
+    "not_",
     "or_",
     "parent",
     "path",