datachain 0.6.5__py3-none-any.whl → 0.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -58,7 +58,7 @@ from datachain.listing import Listing
58
58
  from datachain.node import DirType, Node, NodeWithPath
59
59
  from datachain.nodes_thread_pool import NodesThreadPool
60
60
  from datachain.remote.studio import StudioClient
61
- from datachain.sql.types import DateTime, SQLType, String
61
+ from datachain.sql.types import DateTime, SQLType
62
62
  from datachain.utils import (
63
63
  DataChainDir,
64
64
  batched,
@@ -196,11 +196,6 @@ class DatasetRowsFetcher(NodesThreadPool):
196
196
  for c in [c for c, t in self.schema.items() if t == DateTime]:
197
197
  df[c] = pd.to_datetime(df[c], unit="s")
198
198
 
199
- # strings are represented as binaries in parquet export so need to
200
- # decode it back to strings
201
- for c in [c for c, t in self.schema.items() if t == String]:
202
- df[c] = df[c].str.decode("utf-8")
203
-
204
199
  def do_task(self, urls):
205
200
  import lz4.frame
206
201
  import pandas as pd
@@ -1403,6 +1398,7 @@ class Catalog:
1403
1398
  query_script=remote_dataset_version.query_script,
1404
1399
  create_rows=True,
1405
1400
  columns=columns,
1401
+ feature_schema=remote_dataset_version.feature_schema,
1406
1402
  validate_version=False,
1407
1403
  )
1408
1404
 
@@ -145,6 +145,8 @@ class DirExpansion:
145
145
 
146
146
 
147
147
  class DataTable:
148
+ MAX_RANDOM = 2**63 - 1
149
+
148
150
  def __init__(
149
151
  self,
150
152
  name: str,
@@ -269,8 +271,8 @@ class DataTable:
269
271
  def delete(self):
270
272
  return self.apply_conditions(self.table.delete())
271
273
 
272
- @staticmethod
273
- def sys_columns():
274
+ @classmethod
275
+ def sys_columns(cls):
274
276
  return [
275
277
  sa.Column("sys__id", Int, primary_key=True),
276
278
  sa.Column(
datachain/lib/dc.py CHANGED
@@ -981,10 +981,23 @@ class DataChain:
981
981
 
982
982
  @resolve_columns
983
983
  def order_by(self, *args, descending: bool = False) -> "Self":
984
- """Orders by specified set of signals.
984
+ """Orders by specified set of columns.
985
985
 
986
986
  Parameters:
987
987
  descending (bool): Whether to sort in descending order or not.
988
+
989
+ Example:
990
+ ```py
991
+ dc.order_by("similarity_score", descending=True).limit(10)
992
+ ```
993
+
994
+ Note:
995
+ Order is not guaranteed when steps are added after an `order_by` statement.
996
+ I.e. when using `from_dataset` an `order_by` statement should be used if
997
+ the order of the records in the chain is important.
998
+ Using `order_by` directly before `limit`, `collect` and `collect_flatten`
999
+ will give expected results.
1000
+ See https://github.com/iterative/datachain/issues/477 for further details.
988
1001
  """
989
1002
  if descending:
990
1003
  args = tuple(sqlalchemy.desc(a) for a in args)
@@ -1179,7 +1192,7 @@ class DataChain:
1179
1192
  a tuple of row values.
1180
1193
  """
1181
1194
  db_signals = self._effective_signals_schema.db_signals()
1182
- with self._query.select(*db_signals).as_iterable() as rows:
1195
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1183
1196
  if row_factory:
1184
1197
  rows = (row_factory(db_signals, r) for r in rows)
1185
1198
  yield from rows
@@ -1270,7 +1283,7 @@ class DataChain:
1270
1283
  chain = self.select(*cols) if cols else self
1271
1284
  signals_schema = chain._effective_signals_schema
1272
1285
  db_signals = signals_schema.db_signals()
1273
- with self._query.select(*db_signals).as_iterable() as rows:
1286
+ with self._query.ordered_select(*db_signals).as_iterable() as rows:
1274
1287
  for row in rows:
1275
1288
  ret = signals_schema.row_to_features(
1276
1289
  row, catalog=chain.session.catalog, cache=chain._settings.cache
@@ -1678,7 +1691,7 @@ class DataChain:
1678
1691
 
1679
1692
  model_name = model_name or object_name or ""
1680
1693
  hf_features = next(iter(ds_dict.values())).features
1681
- output = output | get_output_schema(hf_features, model_name)
1694
+ output = output | get_output_schema(hf_features)
1682
1695
  model = dict_to_data_model(model_name, output)
1683
1696
  if object_name:
1684
1697
  output = {object_name: model}
datachain/lib/hf.py CHANGED
@@ -138,17 +138,15 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
138
138
  return HFAudio(**val)
139
139
 
140
140
 
141
- def get_output_schema(
142
- features: Features, model_name: str = "", stream: bool = True
143
- ) -> dict[str, DataType]:
141
+ def get_output_schema(features: Features) -> dict[str, DataType]:
144
142
  """Generate UDF output schema from huggingface datasets features."""
145
143
  fields_dict = {}
146
144
  for name, val in features.items():
147
- fields_dict[name] = _feature_to_chain_type(name, val) # type: ignore[assignment]
148
- return fields_dict # type: ignore[return-value]
145
+ fields_dict[name] = _feature_to_chain_type(name, val)
146
+ return fields_dict
149
147
 
150
148
 
151
- def _feature_to_chain_type(name: str, val: Any) -> type: # noqa: PLR0911
149
+ def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
152
150
  if isinstance(val, Value):
153
151
  return arrow_type_mapper(val.pa_type)
154
152
  if isinstance(val, ClassLabel):
@@ -1276,6 +1276,27 @@ class DatasetQuery:
1276
1276
  query.steps.append(SQLSelect((*args, *named_args)))
1277
1277
  return query
1278
1278
 
1279
+ @detach
1280
+ def ordered_select(self, *args, **kwargs) -> "Self":
1281
+ """
1282
+ Select the given columns or expressions using a subquery whilst
1283
+ maintaining query ordering (only applicable if last step was order_by).
1284
+
1285
+ If used with no arguments, this simply creates a subquery and
1286
+ select all columns from it.
1287
+
1288
+ Example:
1289
+ >>> ds.ordered_select(C.name, C.size * 10)
1290
+ >>> ds.ordered_select(C.name, size10x=C.size * 10)
1291
+ """
1292
+ named_args = [v.label(k) for k, v in kwargs.items()]
1293
+ query = self.clone()
1294
+ order_by = query.last_step if query.is_ordered else None
1295
+ query.steps.append(SQLSelect((*args, *named_args)))
1296
+ if order_by:
1297
+ query.steps.append(order_by)
1298
+ return query
1299
+
1279
1300
  @detach
1280
1301
  def select_except(self, *args) -> "Self":
1281
1302
  """
@@ -1338,7 +1359,7 @@ class DatasetQuery:
1338
1359
  query = self.clone(new_table=False)
1339
1360
  if (
1340
1361
  query.steps
1341
- and (last_step := query.steps[-1])
1362
+ and (last_step := query.last_step)
1342
1363
  and isinstance(last_step, SQLLimit)
1343
1364
  ):
1344
1365
  query.steps[-1] = SQLLimit(min(n, last_step.n))
@@ -1591,3 +1612,11 @@ class DatasetQuery:
1591
1612
  finally:
1592
1613
  self.cleanup()
1593
1614
  return self.__class__(name=name, version=version, catalog=self.catalog)
1615
+
1616
+ @property
1617
+ def is_ordered(self) -> bool:
1618
+ return isinstance(self.last_step, SQLOrderBy)
1619
+
1620
+ @property
1621
+ def last_step(self) -> Optional[Step]:
1622
+ return self.steps[-1] if self.steps else None
datachain/sql/types.py CHANGED
@@ -187,6 +187,22 @@ class Int32(Int):
187
187
  return read_converter(dialect).int32(value)
188
188
 
189
189
 
190
+ class UInt32(Int):
191
+ def load_dialect_impl(self, dialect):
192
+ return converter(dialect).uint32()
193
+
194
+ @staticmethod
195
+ def default_value(dialect):
196
+ return type_defaults(dialect).uint32()
197
+
198
+ @staticmethod
199
+ def db_default_value(dialect):
200
+ return db_defaults(dialect).uint32()
201
+
202
+ def on_read_convert(self, value, dialect):
203
+ return read_converter(dialect).uint32(value)
204
+
205
+
190
206
  class Int64(Int):
191
207
  def load_dialect_impl(self, dialect):
192
208
  return converter(dialect).int64()
@@ -395,6 +411,9 @@ class TypeReadConverter:
395
411
  def int32(self, value):
396
412
  return value
397
413
 
414
+ def uint32(self, value):
415
+ return value
416
+
398
417
  def int64(self, value):
399
418
  return value
400
419
 
@@ -421,6 +440,8 @@ class TypeReadConverter:
421
440
 
422
441
  def json(self, value):
423
442
  if isinstance(value, str):
443
+ if value == "":
444
+ return {}
424
445
  return orjson.loads(value)
425
446
  return value
426
447
 
@@ -446,6 +467,9 @@ class TypeConverter:
446
467
  def int32(self):
447
468
  return self.int()
448
469
 
470
+ def uint32(self):
471
+ return self.int()
472
+
449
473
  def int64(self):
450
474
  return self.int()
451
475
 
@@ -487,6 +511,9 @@ class TypeDefaults:
487
511
  def int32(self):
488
512
  return None
489
513
 
514
+ def uint32(self):
515
+ return None
516
+
490
517
  def int64(self):
491
518
  return None
492
519
 
@@ -528,6 +555,9 @@ class DBDefaults:
528
555
  def int32(self):
529
556
  return self.int()
530
557
 
558
+ def uint32(self):
559
+ return self.int()
560
+
531
561
  def int64(self):
532
562
  return self.int()
533
563
 
@@ -561,6 +591,7 @@ TYPES = [
561
591
  Boolean,
562
592
  Int,
563
593
  Int32,
594
+ UInt32,
564
595
  Int64,
565
596
  UInt64,
566
597
  Float,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.5
3
+ Version: 0.6.7
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -18,7 +18,7 @@ datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
18
18
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
19
  datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
20
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
21
- datachain/catalog/catalog.py,sha256=qFlRrR01_9h1MjK6DEgVSgIwbtZEGV_SdG_E5qUsHmM,57352
21
+ datachain/catalog/catalog.py,sha256=VwItaZG8MUqNKYz0xopDCdkVkbbxgTZYky3ElgsK5-M,57183
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
24
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -34,7 +34,7 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
34
34
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
35
35
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
36
36
  datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
37
- datachain/data_storage/schema.py,sha256=CiRXrDYp5ZZopSyUgZ7MT2ml_6YvqSTYXdybatcbX9M,9849
37
+ datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
38
38
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
39
39
  datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
40
40
  datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
@@ -43,9 +43,9 @@ datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
43
43
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
44
44
  datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
45
45
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
46
- datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
46
+ datachain/lib/dc.py,sha256=U1evAvSs563OMuUVildoaIOuOFiNB6fZcsN4BI8L9f0,85076
47
47
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
48
- datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
48
+ datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
49
49
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
50
50
  datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
51
51
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -73,7 +73,7 @@ datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nf
73
73
  datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
74
74
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
75
75
  datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
76
- datachain/query/dataset.py,sha256=5LJCEpzaJWtWuv5jBheMCQbn9o5El6MiXORLaYcSR44,52924
76
+ datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
77
77
  datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
78
78
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
79
79
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -84,7 +84,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
84
84
  datachain/remote/studio.py,sha256=yCjK5fYN-OseMwakUc2nWU3ktUJNBWJHHSRBaHAwfPw,8768
85
85
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
86
86
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
87
- datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
87
+ datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
88
88
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
89
89
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
90
90
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -100,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
100
100
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
101
101
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
102
102
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
103
- datachain-0.6.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
104
- datachain-0.6.5.dist-info/METADATA,sha256=eSh62q8OKalsO_IHYb0M2lT4y0x5z84uX1WVt7_dZlM,17188
105
- datachain-0.6.5.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
106
- datachain-0.6.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
107
- datachain-0.6.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
108
- datachain-0.6.5.dist-info/RECORD,,
103
+ datachain-0.6.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
104
+ datachain-0.6.7.dist-info/METADATA,sha256=JfsOnrPpyCXuxHel2XXD2BQXK6khsm-z25jxUAx8KIk,17188
105
+ datachain-0.6.7.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
106
+ datachain-0.6.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
107
+ datachain-0.6.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
108
+ datachain-0.6.7.dist-info/RECORD,,