datachain 0.24.1__py3-none-any.whl → 0.24.2__py3-none-any.whl

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -218,7 +218,7 @@ class AbstractWarehouse(ABC, Serializable):
218
218
  limit = query._limit
219
219
  paginated_query = query.limit(page_size)
220
220
 
221
- offset = 0
221
+ offset = query._offset or 0
222
222
  num_yielded = 0
223
223
 
224
224
  # Ensure we're using a thread-local connection
@@ -234,13 +234,13 @@ class AbstractWarehouse(ABC, Serializable):
234
234
  # Cursor results are not thread-safe, so we convert them to a list
235
235
  results = list(wh.dataset_rows_select(paginated_query.offset(offset)))
236
236
 
237
- processed = False
237
+ processed = 0
238
238
  for row in results:
239
- processed = True
239
+ processed += 1
240
240
  yield row
241
241
  num_yielded += 1
242
242
 
243
- if not processed:
243
+ if processed < page_size:
244
244
  break # no more results
245
245
  offset += page_size
246
246
 
@@ -343,6 +343,8 @@ class AbstractWarehouse(ABC, Serializable):
343
343
  if (id_col := get_query_id_column(query)) is None:
344
344
  raise RuntimeError("sys__id column not found in query")
345
345
 
346
+ query = query._clone().offset(None).limit(None).order_by(None)
347
+
346
348
  if is_batched:
347
349
  for batch in ids:
348
350
  yield list(self.dataset_rows_select(query.where(id_col.in_(batch))))
@@ -11,6 +11,7 @@ from collections.abc import Generator, Iterable, Iterator, Sequence
11
11
  from copy import copy
12
12
  from functools import wraps
13
13
  from secrets import token_hex
14
+ from types import GeneratorType
14
15
  from typing import (
15
16
  TYPE_CHECKING,
16
17
  Any,
@@ -557,8 +558,8 @@ class UDFStep(Step, ABC):
557
558
  """
558
559
  assert self.partition_by is not None
559
560
 
560
- if isinstance(self.partition_by, Sequence):
561
- list_partition_by = self.partition_by
561
+ if isinstance(self.partition_by, (list, tuple, GeneratorType)):
562
+ list_partition_by = list(self.partition_by)
562
563
  else:
563
564
  list_partition_by = [self.partition_by]
564
565
 
@@ -575,7 +576,10 @@ class UDFStep(Step, ABC):
575
576
  f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
576
577
  ]
577
578
  self.catalog.warehouse.db.execute(
578
- tbl.insert().from_select(cols, query.with_only_columns(*cols))
579
+ tbl.insert().from_select(
580
+ cols,
581
+ query.offset(None).limit(None).with_only_columns(*cols),
582
+ )
579
583
  )
580
584
 
581
585
  return tbl
@@ -601,13 +605,10 @@ class UDFStep(Step, ABC):
601
605
  if self.partition_by is not None:
602
606
  partition_tbl = self.create_partitions_table(query)
603
607
  temp_tables.append(partition_tbl.name)
604
-
605
- subq = query.subquery()
606
- query = (
607
- sqlalchemy.select(*subq.c)
608
- .outerjoin(partition_tbl, partition_tbl.c.sys__id == subq.c.sys__id)
609
- .add_columns(*partition_columns())
610
- )
608
+ query = query.outerjoin(
609
+ partition_tbl,
610
+ partition_tbl.c.sys__id == query.selected_columns.sys__id,
611
+ ).add_columns(*partition_columns())
611
612
 
612
613
  query, tables = self.process_input_query(query)
613
614
  temp_tables.extend(t.name for t in tables)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.24.1
3
+ Version: 0.24.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -53,7 +53,7 @@ datachain/data_storage/metastore.py,sha256=9mWYOKK3AoHeKPGFm-WBfPrmnYHhwYeXx5MOu
53
53
  datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
54
54
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
55
55
  datachain/data_storage/sqlite.py,sha256=tT_soVi6l_pFSKaDktA1t4qW_vmPvXnvYSf4TZTKZYk,30067
56
- datachain/data_storage/warehouse.py,sha256=_7btARw-kd-Nx19S0qW6JqdF3VYyypQXFzsXq68SWKI,32327
56
+ datachain/data_storage/warehouse.py,sha256=2Bp2fXfcm-acwYjDWqVzGjoIQSAR4L56GPNtPcaT2gU,32418
57
57
  datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
58
58
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
59
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -125,7 +125,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
125
125
  datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
126
126
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
127
127
  datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
128
- datachain/query/dataset.py,sha256=C60VM0pScsrWcMqLNdX-tU0HE1SnEE9lRN3TU8CfTu4,61223
128
+ datachain/query/dataset.py,sha256=mKee4PkQHYPT96utPjM1DocURU4TghAR7AHtYkzdqwY,61292
129
129
  datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
130
130
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
131
131
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
157
157
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
158
158
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
159
159
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
160
- datachain-0.24.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
- datachain-0.24.1.dist-info/METADATA,sha256=lr7Q889hnRechtjUZUnuwDSfiydAGE6wGxMMG9ICSVg,13281
162
- datachain-0.24.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
- datachain-0.24.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
- datachain-0.24.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
- datachain-0.24.1.dist-info/RECORD,,
160
+ datachain-0.24.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
161
+ datachain-0.24.2.dist-info/METADATA,sha256=66Dz51BXpod8ZZG-pcmodALbszVZjNJXykMIrYRwXdA,13281
162
+ datachain-0.24.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
163
+ datachain-0.24.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
164
+ datachain-0.24.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
165
+ datachain-0.24.2.dist-info/RECORD,,