datachain 0.24.0__py3-none-any.whl → 0.24.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/data_storage/warehouse.py CHANGED
@@ -218,7 +218,7 @@ class AbstractWarehouse(ABC, Serializable):
         limit = query._limit
         paginated_query = query.limit(page_size)
 
-        offset = 0
+        offset = query._offset or 0
         num_yielded = 0
 
         # Ensure we're using a thread-local connection
@@ -234,13 +234,13 @@ class AbstractWarehouse(ABC, Serializable):
             # Cursor results are not thread-safe, so we convert them to a list
            results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
 
-            processed = False
+            processed = 0
             for row in results:
-                processed = True
+                processed += 1
                 yield row
                 num_yielded += 1
 
-            if not processed:
+            if processed < page_size:
                 break  # no more results
             offset += page_size
 
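The two changes above tighten the pagination loop: the offset is now seeded from the query's own offset instead of always starting at 0, and counting processed rows lets the loop stop as soon as a page comes back short, instead of issuing one extra empty query. A minimal sketch of the resulting behavior in plain Python (list slicing stands in for the SQL query; names are illustrative):

    def paginate(rows, page_size, start_offset=0):
        offset = start_offset or 0          # was: offset = 0
        while True:
            page = rows[offset : offset + page_size]
            yield from page
            if len(page) < page_size:       # was: if not page
                break                       # no more results
            offset += page_size

    # rows 2..8 come back as [2, 3, 4, 5] and the short [6, 7, 8], which ends the loop
    assert list(paginate(list(range(9)), 4, start_offset=2)) == list(range(2, 9))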
@@ -343,6 +343,8 @@ class AbstractWarehouse(ABC, Serializable):
         if (id_col := get_query_id_column(query)) is None:
             raise RuntimeError("sys__id column not found in query")
 
+        query = query._clone().offset(None).limit(None).order_by(None)
+
         if is_batched:
             for batch in ids:
                 yield list(self.dataset_rows_select(query.where(id_col.in_(batch))))
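The added line clears leftover windowing before rows are re-fetched in explicit sys__id batches: once the exact ids are known, a residual LIMIT, OFFSET, or ORDER BY would re-window each batch. A hedged sketch of the same idea in plain SQLAlchemy Core (table and column names are illustrative, not datachain's):

    import sqlalchemy as sa

    rows = sa.table("rows", sa.column("sys__id"), sa.column("value"))
    base = sa.select(rows).order_by(rows.c.sys__id).limit(100).offset(50)

    # Passing None resets each clause, so the IN-filter sees the full result set
    clean = base.limit(None).offset(None).order_by(None)
    batch_query = clean.where(rows.c.sys__id.in_([1, 2, 3]))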
datachain/delta.py CHANGED
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
 import datachain
 from datachain.dataset import DatasetDependency
 from datachain.error import DatasetNotFoundError
+from datachain.project import Project
 
 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec
@@ -50,15 +51,24 @@ def _append_steps(dc: "DataChain", other: "DataChain"):
 
 def _get_delta_chain(
     source_ds_name: str,
+    source_ds_project: Project,
     source_ds_version: str,
     source_ds_latest_version: str,
     on: Union[str, Sequence[str]],
     compare: Optional[Union[str, Sequence[str]]] = None,
 ) -> "DataChain":
     """Get delta chain for processing changes between versions."""
-    source_dc = datachain.read_dataset(source_ds_name, version=source_ds_version)
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
+    )
     source_dc_latest = datachain.read_dataset(
-        source_ds_name, version=source_ds_latest_version
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_latest_version,
     )
 
     # Calculate diff between source versions
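Both reads now pin the source dataset to its original namespace and project rather than resolving the name in the default scope. A hedged usage sketch of the read_dataset call shape used above (the namespace, project, and version values are illustrative):

    import datachain

    source = datachain.read_dataset(
        "images",              # dataset name
        namespace="dev",       # namespace the dataset lives in
        project="analytics",   # project within that namespace
        version="1.0.1",       # exact version to read
    )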
@@ -67,12 +77,15 @@ def _get_retry_chain(
 
 def _get_retry_chain(
     name: str,
+    project: Project,
     latest_version: str,
     source_ds_name: str,
-    source_ds_latest_version: str,
+    source_ds_project: Project,
+    source_ds_version: str,
     on: Union[str, Sequence[str]],
     right_on: Optional[Union[str, Sequence[str]]],
     delta_retry: Optional[Union[bool, str]],
+    diff_chain: "DataChain",
 ) -> Optional["DataChain"]:
     """Get retry chain for processing error records and missing records."""
     # Import here to avoid circular import
@@ -81,35 +94,49 @@ def _get_retry_chain(
     retry_chain = None
 
     # Read the latest version of the result dataset for retry logic
-    result_dataset = datachain.read_dataset(name, version=latest_version)
-    source_dc_latest = datachain.read_dataset(
-        source_ds_name, version=source_ds_latest_version
+    result_dataset = datachain.read_dataset(
+        name,
+        namespace=project.namespace.name,
+        project=project.name,
+        version=latest_version,
+    )
+    source_dc = datachain.read_dataset(
+        source_ds_name,
+        namespace=source_ds_project.namespace.name,
+        project=source_ds_project.name,
+        version=source_ds_version,
     )
 
     # Handle error records if delta_retry is a string (column name)
     if isinstance(delta_retry, str):
         error_records = result_dataset.filter(C(delta_retry) != "")
-        error_source_records = source_dc_latest.merge(
+        error_source_records = source_dc.merge(
             error_records, on=on, right_on=right_on, inner=True
-        ).select(*list(source_dc_latest.signals_schema.values))
+        ).select(*list(source_dc.signals_schema.values))
         retry_chain = error_source_records
 
     # Handle missing records if delta_retry is True
     elif delta_retry is True:
-        missing_records = source_dc_latest.subtract(
-            result_dataset, on=on, right_on=right_on
-        )
+        missing_records = source_dc.subtract(result_dataset, on=on, right_on=right_on)
         retry_chain = missing_records
 
-    return retry_chain
+    # Subtract also diff chain since some items might be picked
+    # up by `delta=True` itself (e.g. records got modified AND are missing in the
+    # result dataset atm)
+    return retry_chain.subtract(diff_chain, on=on) if retry_chain else None
 
 
 def _get_source_info(
     name: str,
+    project: Project,
     latest_version: str,
     catalog,
 ) -> tuple[
-    Optional[str], Optional[str], Optional[str], Optional[list[DatasetDependency]]
+    Optional[str],
+    Optional[Project],
+    Optional[str],
+    Optional[str],
+    Optional[list[DatasetDependency]],
 ]:
     """Get source dataset information and dependencies.
 
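The new return line deduplicates retry work against the delta diff: a record that was both modified upstream and is missing from the result dataset would otherwise be queued twice. A set-style sketch of that subtraction (pure Python, illustrative ids):

    diff_ids = {1, 2, 3}   # already picked up by delta=True (changed upstream)
    retry_ids = {3, 4}     # errored or missing in the result dataset

    # retry_chain.subtract(diff_chain, on=on) keeps only the retries delta missed
    to_process = diff_ids | (retry_ids - diff_ids)
    assert to_process == {1, 2, 3, 4}   # record 3 is processed exactly once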
@@ -118,23 +145,34 @@ def _get_source_info(
     Returns (None, None, None, None) if source dataset was removed.
     """
     dependencies = catalog.get_dataset_dependencies(
-        name, latest_version, indirect=False
+        name, latest_version, project=project, indirect=False
     )
 
     dep = dependencies[0]
     if not dep:
         # Starting dataset was removed, back off to normal dataset creation
-        return None, None, None, None
+        return None, None, None, None, None
 
+    source_ds_project = catalog.metastore.get_project(dep.project, dep.namespace)
     source_ds_name = dep.name
     source_ds_version = dep.version
-    source_ds_latest_version = catalog.get_dataset(source_ds_name).latest_version
-
-    return source_ds_name, source_ds_version, source_ds_latest_version, dependencies
+    source_ds_latest_version = catalog.get_dataset(
+        source_ds_name, project=source_ds_project
+    ).latest_version
+
+    return (
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        dependencies,
+    )
 
 
 def delta_retry_update(
     dc: "DataChain",
+    namespace_name: str,
+    project_name: str,
     name: str,
     on: Union[str, Sequence[str]],
     right_on: Optional[Union[str, Sequence[str]]] = None,
@@ -173,11 +211,12 @@ def delta_retry_update(
     """
 
     catalog = dc.session.catalog
+    project = catalog.metastore.get_project(project_name, namespace_name)
     dc._query.apply_listing_pre_step()
 
     # Check if dataset exists
     try:
-        dataset = catalog.get_dataset(name)
+        dataset = catalog.get_dataset(name, project=project)
         latest_version = dataset.latest_version
     except DatasetNotFoundError:
         # First creation of result dataset
@@ -189,19 +228,29 @@ def delta_retry_update(
     retry_chain = None
     processing_chain = None
 
-    source_ds_name, source_ds_version, source_ds_latest_version, dependencies = (
-        _get_source_info(name, latest_version, catalog)
-    )
+    (
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        dependencies,
+    ) = _get_source_info(name, project, latest_version, catalog)
 
     # If source_ds_name is None, starting dataset was removed
     if source_ds_name is None:
         return None, None, True
 
+    assert source_ds_project
     assert source_ds_version
     assert source_ds_latest_version
 
     diff_chain = _get_delta_chain(
-        source_ds_name, source_ds_version, source_ds_latest_version, on, compare
+        source_ds_name,
+        source_ds_project,
+        source_ds_version,
+        source_ds_latest_version,
+        on,
+        compare,
     )
 
     # Filter out removed dep
@@ -215,12 +264,15 @@ def delta_retry_update(
     if delta_retry:
         retry_chain = _get_retry_chain(
             name,
+            project,
             latest_version,
             source_ds_name,
-            source_ds_latest_version,
+            source_ds_project,
+            source_ds_version,
             on,
             right_on,
             delta_retry,
+            diff_chain,
         )
 
     # Combine delta and retry chains
@@ -236,7 +288,12 @@ def delta_retry_update(
     if processing_chain is None or (processing_chain and processing_chain.empty):
         return None, None, False
 
-    latest_dataset = datachain.read_dataset(name, version=latest_version)
+    latest_dataset = datachain.read_dataset(
+        name,
+        namespace=project.namespace.name,
+        project=project.name,
+        version=latest_version,
+    )
     compared_chain = latest_dataset.diff(
         processing_chain,
         on=right_on or on,
datachain/lib/dc/datachain.py CHANGED
@@ -598,6 +598,8 @@ class DataChain:
 
         result_ds, dependencies, has_changes = delta_retry_update(
             self,
+            namespace_name,
+            project_name,
             name,
             on=self._delta_on,
             right_on=self._delta_result_on,
datachain/query/dataset.py CHANGED
@@ -11,6 +11,7 @@ from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
 from secrets import token_hex
+from types import GeneratorType
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -557,8 +558,8 @@ class UDFStep(Step, ABC):
         """
         assert self.partition_by is not None
 
-        if isinstance(self.partition_by, Sequence):
-            list_partition_by = self.partition_by
+        if isinstance(self.partition_by, (list, tuple, GeneratorType)):
+            list_partition_by = list(self.partition_by)
         else:
             list_partition_by = [self.partition_by]
 
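The widened isinstance check matters for generators: a generator is not a Sequence, so it previously fell through to the single-value branch, and it can be consumed only once, hence the list(...) materialization. A quick demonstration of both points:

    from collections.abc import Sequence
    from types import GeneratorType

    gen = (name for name in ("city", "country"))
    assert not isinstance(gen, Sequence)     # old check missed generators
    assert isinstance(gen, GeneratorType)    # new check catches them

    cols = list(gen)                         # materialize once, reuse many times
    assert cols == ["city", "country"]
    assert list(gen) == []                   # a generator is exhausted after one pass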
@@ -575,7 +576,10 @@ class UDFStep(Step, ABC):
             f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
         ]
         self.catalog.warehouse.db.execute(
-            tbl.insert().from_select(cols, query.with_only_columns(*cols))
+            tbl.insert().from_select(
+                cols,
+                query.offset(None).limit(None).with_only_columns(*cols),
+            )
         )
 
         return tbl
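The insert that fills the partitions table now strips LIMIT and OFFSET from the source select, so partition ranks are computed over the whole query rather than a single window of it. A hedged SQLAlchemy Core sketch of that INSERT ... FROM SELECT shape (table and column names are illustrative):

    import sqlalchemy as sa
    from sqlalchemy import func as f

    rows = sa.table("rows", sa.column("sys__id"), sa.column("city"))
    part = sa.table("partitions", sa.column("sys__id"), sa.column("partition_id"))

    query = sa.select(rows).limit(10).offset(20)   # possibly windowed upstream
    cols = [
        rows.c.sys__id,
        f.dense_rank().over(order_by=rows.c.city).label("partition_id"),
    ]
    stmt = part.insert().from_select(
        ["sys__id", "partition_id"],
        query.offset(None).limit(None).with_only_columns(*cols),
    )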
@@ -601,13 +605,10 @@ class UDFStep(Step, ABC):
         if self.partition_by is not None:
             partition_tbl = self.create_partitions_table(query)
             temp_tables.append(partition_tbl.name)
-
-            subq = query.subquery()
-            query = (
-                sqlalchemy.select(*subq.c)
-                .outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
-                .add_columns(*partition_columns())
-            )
+            query = query.outerjoin(
+                partition_tbl,
+                partition_tbl.c.sys__id == query.selected_columns.sys__id,
+            ).add_columns(*partition_columns())
 
         query, tables = self.process_input_query(query)
         temp_tables.extend(t.name for t in tables)
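Rather than flattening the query into a subquery and re-selecting from it, the join now targets the live Select through its selected_columns accessor, leaving the original statement (including the offset/limit handling fixed above) intact. A hedged standalone sketch of the pattern (illustrative names):

    import sqlalchemy as sa

    rows = sa.table("rows", sa.column("sys__id"), sa.column("value"))
    part = sa.table("partitions", sa.column("sys__id"), sa.column("partition_id"))

    query = sa.select(rows)
    query = query.outerjoin(
        part,
        part.c.sys__id == query.selected_columns.sys__id,
    ).add_columns(part.c.partition_id)   # partition id rides along with each row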
datachain-0.24.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datachain
-Version: 0.24.0
+Version: 0.24.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License-Expression: Apache-2.0
datachain-0.24.2.dist-info/RECORD CHANGED
@@ -4,7 +4,7 @@ datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=wDrukmkDnYP0X8bAGY-7O1NDE3DWCFqrH8VVDpXM9Ok,25263
-datachain/delta.py,sha256=4RqLLc9dJLF8x9GG9IDgi86DwuPerZQ4HAUnNBeACw8,8446
+datachain/delta.py,sha256=fTEhCedseUsHuH_Ek52NXFhFPyFD_6MioEH5sCilNgo,9897
 datachain/error.py,sha256=OWwWMkzZYJrkcoEDGhJHMf7SfKvxcsOLRF94mjPf29I,1609
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=T4bCgdCRuFW7bsPUG2PSl5om2nfJL6fzB84m7mCO8cA,7136
@@ -53,7 +53,7 @@ datachain/data_storage/metastore.py,sha256=9mWYOKK3AoHeKPGFm-WBfPrmnYHhwYeXx5MOu
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=tT_soVi6l_pFSKaDktA1t4qW_vmPvXnvYSf4TZTKZYk,30067
-datachain/data_storage/warehouse.py,sha256=_7btARw-kd-Nx19S0qW6JqdF3VYyypQXFzsXq68SWKI,32327
+datachain/data_storage/warehouse.py,sha256=2Bp2fXfcm-acwYjDWqVzGjoIQSAR4L56GPNtPcaT2gU,32418
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -103,7 +103,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=dFI7JX5-41HLgA-TUR99dtR1lvk2vokaMC3mbIW1XT4,85814
+datachain/lib/dc/datachain.py,sha256=dOPtNOYx6ocFr61YHTDrKGoMTDWDY0AZt8MLh79EJkc,85876
 datachain/lib/dc/datasets.py,sha256=U4xqAfs6FdW8HIJjeayQaIg1dunaIsVXYGqfq_sDSv0,13274
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
@@ -125,7 +125,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
 datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=C60VM0pScsrWcMqLNdX-tU0HE1SnEE9lRN3TU8CfTu4,61223
+datachain/query/dataset.py,sha256=mKee4PkQHYPT96utPjM1DocURU4TghAR7AHtYkzdqwY,61292
 datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.24.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.24.0.dist-info/METADATA,sha256=QWSVON3r5d5d18gRMs9G5DNV4z-kBBY47dMYUEFR0b0,13281
-datachain-0.24.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datachain-0.24.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.24.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.24.0.dist-info/RECORD,,
+datachain-0.24.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.24.2.dist-info/METADATA,sha256=66Dz51BXpod8ZZG-pcmodALbszVZjNJXykMIrYRwXdA,13281
+datachain-0.24.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.24.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.24.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.24.2.dist-info/RECORD,,