datachain 0.24.0__py3-none-any.whl → 0.24.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/delta.py +82 -25
- datachain/lib/dc/datachain.py +2 -0
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/METADATA +1 -1
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/RECORD +8 -8
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/WHEEL +0 -0
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.24.0.dist-info → datachain-0.24.1.dist-info}/top_level.txt +0 -0
datachain/delta.py
CHANGED
|
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
|
|
|
6
6
|
import datachain
|
|
7
7
|
from datachain.dataset import DatasetDependency
|
|
8
8
|
from datachain.error import DatasetNotFoundError
|
|
9
|
+
from datachain.project import Project
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
12
|
from typing_extensions import Concatenate, ParamSpec
|
|
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
|
|
|
50
51
|
|
|
51
52
|
def _get_delta_chain(
|
|
52
53
|
source_ds_name: str,
|
|
54
|
+
source_ds_project: Project,
|
|
53
55
|
source_ds_version: str,
|
|
54
56
|
source_ds_latest_version: str,
|
|
55
57
|
on: Union[str, Sequence[str]],
|
|
56
58
|
compare: Optional[Union[str, Sequence[str]]] = None,
|
|
57
59
|
) -> "DataChain":
|
|
58
60
|
"""Get delta chain for processing changes between versions."""
|
|
59
|
-
source_dc = datachain.read_dataset(
|
|
61
|
+
source_dc = datachain.read_dataset(
|
|
62
|
+
source_ds_name,
|
|
63
|
+
namespace=source_ds_project.namespace.name,
|
|
64
|
+
project=source_ds_project.name,
|
|
65
|
+
version=source_ds_version,
|
|
66
|
+
)
|
|
60
67
|
source_dc_latest = datachain.read_dataset(
|
|
61
|
-
source_ds_name,
|
|
68
|
+
source_ds_name,
|
|
69
|
+
namespace=source_ds_project.namespace.name,
|
|
70
|
+
project=source_ds_project.name,
|
|
71
|
+
version=source_ds_latest_version,
|
|
62
72
|
)
|
|
63
73
|
|
|
64
74
|
# Calculate diff between source versions
|
|
@@ -67,12 +77,15 @@ def _get_delta_chain(
|
|
|
67
77
|
|
|
68
78
|
def _get_retry_chain(
|
|
69
79
|
name: str,
|
|
80
|
+
project: Project,
|
|
70
81
|
latest_version: str,
|
|
71
82
|
source_ds_name: str,
|
|
72
|
-
|
|
83
|
+
source_ds_project: Project,
|
|
84
|
+
source_ds_version: str,
|
|
73
85
|
on: Union[str, Sequence[str]],
|
|
74
86
|
right_on: Optional[Union[str, Sequence[str]]],
|
|
75
87
|
delta_retry: Optional[Union[bool, str]],
|
|
88
|
+
diff_chain: "DataChain",
|
|
76
89
|
) -> Optional["DataChain"]:
|
|
77
90
|
"""Get retry chain for processing error records and missing records."""
|
|
78
91
|
# Import here to avoid circular import
|
|
@@ -81,35 +94,49 @@ def _get_retry_chain(
|
|
|
81
94
|
retry_chain = None
|
|
82
95
|
|
|
83
96
|
# Read the latest version of the result dataset for retry logic
|
|
84
|
-
result_dataset = datachain.read_dataset(
|
|
85
|
-
|
|
86
|
-
|
|
97
|
+
result_dataset = datachain.read_dataset(
|
|
98
|
+
name,
|
|
99
|
+
namespace=project.namespace.name,
|
|
100
|
+
project=project.name,
|
|
101
|
+
version=latest_version,
|
|
102
|
+
)
|
|
103
|
+
source_dc = datachain.read_dataset(
|
|
104
|
+
source_ds_name,
|
|
105
|
+
namespace=source_ds_project.namespace.name,
|
|
106
|
+
project=source_ds_project.name,
|
|
107
|
+
version=source_ds_version,
|
|
87
108
|
)
|
|
88
109
|
|
|
89
110
|
# Handle error records if delta_retry is a string (column name)
|
|
90
111
|
if isinstance(delta_retry, str):
|
|
91
112
|
error_records = result_dataset.filter(C(delta_retry) != "")
|
|
92
|
-
error_source_records =
|
|
113
|
+
error_source_records = source_dc.merge(
|
|
93
114
|
error_records, on=on, right_on=right_on, inner=True
|
|
94
|
-
).select(*list(
|
|
115
|
+
).select(*list(source_dc.signals_schema.values))
|
|
95
116
|
retry_chain = error_source_records
|
|
96
117
|
|
|
97
118
|
# Handle missing records if delta_retry is True
|
|
98
119
|
elif delta_retry is True:
|
|
99
|
-
missing_records =
|
|
100
|
-
result_dataset, on=on, right_on=right_on
|
|
101
|
-
)
|
|
120
|
+
missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
|
|
102
121
|
retry_chain = missing_records
|
|
103
122
|
|
|
104
|
-
|
|
123
|
+
# Subtract also diff chain since some items might be picked
|
|
124
|
+
# up by `delta=True` itself (e.g. records got modified AND are missing in the
|
|
125
|
+
# result dataset atm)
|
|
126
|
+
return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
|
|
105
127
|
|
|
106
128
|
|
|
107
129
|
def _get_source_info(
|
|
108
130
|
name: str,
|
|
131
|
+
project: Project,
|
|
109
132
|
latest_version: str,
|
|
110
133
|
catalog,
|
|
111
134
|
) -> tuple[
|
|
112
|
-
Optional[str],
|
|
135
|
+
Optional[str],
|
|
136
|
+
Optional[Project],
|
|
137
|
+
Optional[str],
|
|
138
|
+
Optional[str],
|
|
139
|
+
Optional[list[DatasetDependency]],
|
|
113
140
|
]:
|
|
114
141
|
"""Get source dataset information and dependencies.
|
|
115
142
|
|
|
@@ -118,23 +145,34 @@ def _get_source_info(
|
|
|
118
145
|
Returns (None, None, None, None, None) if source dataset was removed.
|
|
119
146
|
"""
|
|
120
147
|
dependencies = catalog.get_dataset_dependencies(
|
|
121
|
-
name, latest_version, indirect=False
|
|
148
|
+
name, latest_version, project=project, indirect=False
|
|
122
149
|
)
|
|
123
150
|
|
|
124
151
|
dep = dependencies[0]
|
|
125
152
|
if not dep:
|
|
126
153
|
# Starting dataset was removed, back off to normal dataset creation
|
|
127
|
-
return None, None, None, None
|
|
154
|
+
return None, None, None, None, None
|
|
128
155
|
|
|
156
|
+
source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
|
|
129
157
|
source_ds_name = dep.name
|
|
130
158
|
source_ds_version = dep.version
|
|
131
|
-
source_ds_latest_version = catalog.get_dataset(
|
|
132
|
-
|
|
133
|
-
|
|
159
|
+
source_ds_latest_version = catalog.get_dataset(
|
|
160
|
+
source_ds_name, project=source_ds_project
|
|
161
|
+
).latest_version
|
|
162
|
+
|
|
163
|
+
return (
|
|
164
|
+
source_ds_name,
|
|
165
|
+
source_ds_project,
|
|
166
|
+
source_ds_version,
|
|
167
|
+
source_ds_latest_version,
|
|
168
|
+
dependencies,
|
|
169
|
+
)
|
|
134
170
|
|
|
135
171
|
|
|
136
172
|
def delta_retry_update(
|
|
137
173
|
dc: "DataChain",
|
|
174
|
+
namespace_name: str,
|
|
175
|
+
project_name: str,
|
|
138
176
|
name: str,
|
|
139
177
|
on: Union[str, Sequence[str]],
|
|
140
178
|
right_on: Optional[Union[str, Sequence[str]]] = None,
|
|
@@ -173,11 +211,12 @@ def delta_retry_update(
|
|
|
173
211
|
"""
|
|
174
212
|
|
|
175
213
|
catalog = dc.session.catalog
|
|
214
|
+
project = catalog.metastore.get_project(project_name, namespace_name)
|
|
176
215
|
dc._query.apply_listing_pre_step()
|
|
177
216
|
|
|
178
217
|
# Check if dataset exists
|
|
179
218
|
try:
|
|
180
|
-
dataset = catalog.get_dataset(name)
|
|
219
|
+
dataset = catalog.get_dataset(name, project=project)
|
|
181
220
|
latest_version = dataset.latest_version
|
|
182
221
|
except DatasetNotFoundError:
|
|
183
222
|
# First creation of result dataset
|
|
@@ -189,19 +228,29 @@ def delta_retry_update(
|
|
|
189
228
|
retry_chain = None
|
|
190
229
|
processing_chain = None
|
|
191
230
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
231
|
+
(
|
|
232
|
+
source_ds_name,
|
|
233
|
+
source_ds_project,
|
|
234
|
+
source_ds_version,
|
|
235
|
+
source_ds_latest_version,
|
|
236
|
+
dependencies,
|
|
237
|
+
) = _get_source_info(name, project, latest_version, catalog)
|
|
195
238
|
|
|
196
239
|
# If source_ds_name is None, starting dataset was removed
|
|
197
240
|
if source_ds_name is None:
|
|
198
241
|
return None, None, True
|
|
199
242
|
|
|
243
|
+
assert source_ds_project
|
|
200
244
|
assert source_ds_version
|
|
201
245
|
assert source_ds_latest_version
|
|
202
246
|
|
|
203
247
|
diff_chain = _get_delta_chain(
|
|
204
|
-
source_ds_name,
|
|
248
|
+
source_ds_name,
|
|
249
|
+
source_ds_project,
|
|
250
|
+
source_ds_version,
|
|
251
|
+
source_ds_latest_version,
|
|
252
|
+
on,
|
|
253
|
+
compare,
|
|
205
254
|
)
|
|
206
255
|
|
|
207
256
|
# Filter out removed dep
|
|
@@ -215,12 +264,15 @@ def delta_retry_update(
|
|
|
215
264
|
if delta_retry:
|
|
216
265
|
retry_chain = _get_retry_chain(
|
|
217
266
|
name,
|
|
267
|
+
project,
|
|
218
268
|
latest_version,
|
|
219
269
|
source_ds_name,
|
|
220
|
-
|
|
270
|
+
source_ds_project,
|
|
271
|
+
source_ds_version,
|
|
221
272
|
on,
|
|
222
273
|
right_on,
|
|
223
274
|
delta_retry,
|
|
275
|
+
diff_chain,
|
|
224
276
|
)
|
|
225
277
|
|
|
226
278
|
# Combine delta and retry chains
|
|
@@ -236,7 +288,12 @@ def delta_retry_update(
|
|
|
236
288
|
if processing_chain is None or (processing_chain and processing_chain.empty):
|
|
237
289
|
return None, None, False
|
|
238
290
|
|
|
239
|
-
latest_dataset = datachain.read_dataset(
|
|
291
|
+
latest_dataset = datachain.read_dataset(
|
|
292
|
+
name,
|
|
293
|
+
namespace=project.namespace.name,
|
|
294
|
+
project=project.name,
|
|
295
|
+
version=latest_version,
|
|
296
|
+
)
|
|
240
297
|
compared_chain = latest_dataset.diff(
|
|
241
298
|
processing_chain,
|
|
242
299
|
on=right_on or on,
|
datachain/lib/dc/datachain.py
CHANGED
|
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
|
|
|
4
4
|
datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
|
|
5
5
|
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
|
|
6
6
|
datachain/dataset.py,sha256=wDrukmkDnYP0X8bAGY-7O1NDE3DWCFqrH8VVDpXM9Ok,25263
|
|
7
|
-
datachain/delta.py,sha256=
|
|
7
|
+
datachain/delta.py,sha256=fTEhCedseUsHuH_Ek52NXFhFPyFD_6MioEH5sCilNgo,9897
|
|
8
8
|
datachain/error.py,sha256=OWwWMkzZYJrkcoEDGhJHMf7SfKvxcsOLRF94mjPf29I,1609
|
|
9
9
|
datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
|
|
10
10
|
datachain/listing.py,sha256=T4bCgdCRuFW7bsPUG2PSl5om2nfJL6fzB84m7mCO8cA,7136
|
|
@@ -103,7 +103,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
|
|
|
103
103
|
datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
|
|
104
104
|
datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
|
|
105
105
|
datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
|
|
106
|
-
datachain/lib/dc/datachain.py,sha256=
|
|
106
|
+
datachain/lib/dc/datachain.py,sha256=dOPtNOYx6ocFr61YHTDrKGoMTDWDY0AZt8MLh79EJkc,85876
|
|
107
107
|
datachain/lib/dc/datasets.py,sha256=U4xqAfs6FdW8HIJjeayQaIg1dunaIsVXYGqfq_sDSv0,13274
|
|
108
108
|
datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
|
|
109
109
|
datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
|
|
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
157
157
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
158
158
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
159
159
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
160
|
-
datachain-0.24.
|
|
161
|
-
datachain-0.24.
|
|
162
|
-
datachain-0.24.
|
|
163
|
-
datachain-0.24.
|
|
164
|
-
datachain-0.24.
|
|
165
|
-
datachain-0.24.
|
|
160
|
+
datachain-0.24.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
161
|
+
datachain-0.24.1.dist-info/METADATA,sha256=lr7Q889hnRechtjUZUnuwDSfiydAGE6wGxMMG9ICSVg,13281
|
|
162
|
+
datachain-0.24.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
163
|
+
datachain-0.24.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
164
|
+
datachain-0.24.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
165
|
+
datachain-0.24.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|