datachain 0.6.5__py3-none-any.whl → 0.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datachain/lib/dc.py CHANGED
@@ -981,10 +981,23 @@ class DataChain:
981
981
 
982
982
  @resolve_columns
983
983
  def order_by(self, *args, descending: bool = False) -> "Self":
984
- """Orders by specified set of signals.
984
+ """Orders by specified set of columns.
985
985
 
986
986
  Parameters:
987
987
  descending (bool): Whether to sort in descending order or not.
988
+
989
+ Example:
990
+ ```py
991
+ dc.order_by("similarity_score", descending=True).limit(10)
992
+ ```
993
+
994
+ Note:
995
+ Order is not guaranteed when steps are added after an `order_by` statement.
996
+ I.e. when using `from_dataset` an `order_by` statement should be used if
997
+ the order of the records in the chain is important.
998
+ Using `order_by` directly before `limit`, `collect` and `collect_flatten`
999
+ will give expected results.
1000
+ See https://github.com/iterative/datachain/issues/477 for further details.
988
1001
  """
989
1002
  if descending:
990
1003
  args = tuple(sqlalchemy.desc(a) for a in args)
@@ -1179,7 +1192,7 @@ class DataChain:
1179
1192
  a tuple of row values.
1180
1193
  """
1181
1194
  db_signals = self._effective_signals_schema.db_signals()
1182
- with self._query.select(*db_signals).as_iterable() as rows:
1195
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1183
1196
  if row_factory:
1184
1197
  rows = (row_factory(db_signals, r) for r in rows)
1185
1198
  yield from rows
@@ -1270,7 +1283,7 @@ class DataChain:
1270
1283
  chain = self.select(*cols) if cols else self
1271
1284
  signals_schema = chain._effective_signals_schema
1272
1285
  db_signals = signals_schema.db_signals()
1273
- with self._query.select(*db_signals).as_iterable() as rows:
1286
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1274
1287
  for row in rows:
1275
1288
  ret = signals_schema.row_to_features(
1276
1289
  row, catalog=chain.session.catalog, cache=chain._settings.cache
@@ -1678,7 +1691,7 @@ class DataChain:
1678
1691
 
1679
1692
  model_name = model_name or object_name or ""
1680
1693
  hf_features = next(iter(ds_dict.values())).features
1681
- output = output | get_output_schema(hf_features, model_name)
1694
+ output = output | get_output_schema(hf_features)
1682
1695
  model = dict_to_data_model(model_name, output)
1683
1696
  if object_name:
1684
1697
  output = {object_name: model}
datachain/lib/hf.py CHANGED
@@ -138,17 +138,15 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
138
138
  return HFAudio(**val)
139
139
 
140
140
 
141
- def get_output_schema(
142
- features: Features, model_name: str = "", stream: bool = True
143
- ) -> dict[str, DataType]:
141
+ def get_output_schema(features: Features) -> dict[str, DataType]:
144
142
  """Generate UDF output schema from huggingface datasets features."""
145
143
  fields_dict = {}
146
144
  for name, val in features.items():
147
- fields_dict[name] = _feature_to_chain_type(name, val) # type: ignore[assignment]
148
- return fields_dict # type: ignore[return-value]
145
+ fields_dict[name] = _feature_to_chain_type(name, val)
146
+ return fields_dict
149
147
 
150
148
 
151
- def _feature_to_chain_type(name: str, val: Any) -> type: # noqa: PLR0911
149
+ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
152
150
  if isinstance(val, Value):
153
151
  return arrow_type_mapper(val.pa_type)
154
152
  if isinstance(val, ClassLabel):
@@ -1276,6 +1276,27 @@ class DatasetQuery:
1276
1276
  query.steps.append(SQLSelect((*args, *named_args)))
1277
1277
  return query
1278
1278
 
1279
+ @detach
1280
+ def ordered_select(self, *args, **kwargs) -> "Self":
1281
+ """
1282
+ Select the given columns or expressions using a subquery whilst
1283
+ maintaining query ordering (only applicable if last step was order_by).
1284
+
1285
+ If used with no arguments, this simply creates a subquery and
1286
+ select all columns from it.
1287
+
1288
+ Example:
1289
+ >>> ds.ordered_select(C.name, C.size * 10)
1290
+ >>> ds.ordered_select(C.name, size10x=C.size * 10)
1291
+ """
1292
+ named_args = [v.label(k) for k, v in kwargs.items()]
1293
+ query = self.clone()
1294
+ order_by = query.last_step if query.is_ordered else None
1295
+ query.steps.append(SQLSelect((*args, *named_args)))
1296
+ if order_by:
1297
+ query.steps.append(order_by)
1298
+ return query
1299
+
1279
1300
  @detach
1280
1301
  def select_except(self, *args) -> "Self":
1281
1302
  """
@@ -1338,7 +1359,7 @@ class DatasetQuery:
1338
1359
  query = self.clone(new_table=False)
1339
1360
  if (
1340
1361
  query.steps
1341
- and (last_step := query.steps[-1])
1362
+ and (last_step := query.last_step)
1342
1363
  and isinstance(last_step, SQLLimit)
1343
1364
  ):
1344
1365
  query.steps[-1] = SQLLimit(min(n, last_step.n))
@@ -1591,3 +1612,11 @@ class DatasetQuery:
1591
1612
  finally:
1592
1613
  self.cleanup()
1593
1614
  return self.__class__(name=name, version=version, catalog=self.catalog)
1615
+
1616
+ @property
1617
+ def is_ordered(self) -> bool:
1618
+ return isinstance(self.last_step, SQLOrderBy)
1619
+
1620
+ @property
1621
+ def last_step(self) -> Optional[Step]:
1622
+ return self.steps[-1] if self.steps else None
datachain/sql/types.py CHANGED
@@ -187,6 +187,22 @@ class Int32(Int):
187
187
  return read_converter(dialect).int32(value)
188
188
 
189
189
 
190
+ class UInt32(Int):
191
+ def load_dialect_impl(self, dialect):
192
+ return converter(dialect).uint32()
193
+
194
+ @staticmethod
195
+ def default_value(dialect):
196
+ return type_defaults(dialect).uint32()
197
+
198
+ @staticmethod
199
+ def db_default_value(dialect):
200
+ return db_defaults(dialect).uint32()
201
+
202
+ def on_read_convert(self, value, dialect):
203
+ return read_converter(dialect).uint32(value)
204
+
205
+
190
206
  class Int64(Int):
191
207
  def load_dialect_impl(self, dialect):
192
208
  return converter(dialect).int64()
@@ -395,6 +411,9 @@ class TypeReadConverter:
395
411
  def int32(self, value):
396
412
  return value
397
413
 
414
+ def uint32(self, value):
415
+ return value
416
+
398
417
  def int64(self, value):
399
418
  return value
400
419
 
@@ -446,6 +465,9 @@ class TypeConverter:
446
465
  def int32(self):
447
466
  return self.int()
448
467
 
468
+ def uint32(self):
469
+ return self.int()
470
+
449
471
  def int64(self):
450
472
  return self.int()
451
473
 
@@ -487,6 +509,9 @@ class TypeDefaults:
487
509
  def int32(self):
488
510
  return None
489
511
 
512
+ def uint32(self):
513
+ return None
514
+
490
515
  def int64(self):
491
516
  return None
492
517
 
@@ -528,6 +553,9 @@ class DBDefaults:
528
553
  def int32(self):
529
554
  return self.int()
530
555
 
556
+ def uint32(self):
557
+ return self.int()
558
+
531
559
  def int64(self):
532
560
  return self.int()
533
561
 
@@ -561,6 +589,7 @@ TYPES = [
561
589
  Boolean,
562
590
  Int,
563
591
  Int32,
592
+ UInt32,
564
593
  Int64,
565
594
  UInt64,
566
595
  Float,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.5
3
+ Version: 0.6.6
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -43,9 +43,9 @@ datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
43
43
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
44
44
  datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
45
45
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
46
- datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
46
+ datachain/lib/dc.py,sha256=U1evAvSs563OMuUVildoaIOuOFiNB6fZcsN4BI8L9f0,85076
47
47
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
48
- datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
48
+ datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
49
49
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
50
50
  datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
51
51
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -73,7 +73,7 @@ datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nf
73
73
  datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
74
74
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
75
75
  datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
76
- datachain/query/dataset.py,sha256=5LJCEpzaJWtWuv5jBheMCQbn9o5El6MiXORLaYcSR44,52924
76
+ datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
77
77
  datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
78
78
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
79
79
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -84,7 +84,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
84
84
  datachain/remote/studio.py,sha256=yCjK5fYN-OseMwakUc2nWU3ktUJNBWJHHSRBaHAwfPw,8768
85
85
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
86
86
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
87
- datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
87
+ datachain/sql/types.py,sha256=RjgWb4Bh-pxzZpBCAyjbtDociU01ZPQ7l-SPueaRpNA,13991
88
88
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
89
89
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
90
90
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -100,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
100
100
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
101
101
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
102
102
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
103
- datachain-0.6.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
104
- datachain-0.6.5.dist-info/METADATA,sha256=eSh62q8OKalsO_IHYb0M2lT4y0x5z84uX1WVt7_dZlM,17188
105
- datachain-0.6.5.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
106
- datachain-0.6.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
107
- datachain-0.6.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
108
- datachain-0.6.5.dist-info/RECORD,,
103
+ datachain-0.6.6.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
104
+ datachain-0.6.6.dist-info/METADATA,sha256=Z211Vh59IGXt-dRZTSI9zYgTnvmAmTPbmsfRh_vWE8Q,17188
105
+ datachain-0.6.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
106
+ datachain-0.6.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
107
+ datachain-0.6.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
108
+ datachain-0.6.6.dist-info/RECORD,,