datachain 0.6.4__py3-none-any.whl → 0.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/client/hf.py +1 -0
- datachain/lib/arrow.py +1 -1
- datachain/lib/dc.py +17 -4
- datachain/lib/hf.py +4 -6
- datachain/query/dataset.py +30 -1
- datachain/sql/types.py +29 -0
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/METADATA +1 -1
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/RECORD +12 -12
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/LICENSE +0 -0
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/WHEEL +0 -0
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.4.dist-info → datachain-0.6.6.dist-info}/top_level.txt +0 -0
datachain/client/hf.py
CHANGED
datachain/lib/arrow.py
CHANGED
|
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa:
|
|
|
175
175
|
return dict
|
|
176
176
|
if isinstance(col_type, pa.lib.DictionaryType):
|
|
177
177
|
return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
|
|
178
|
-
raise TypeError(f"{col_type!r} datatypes not supported")
|
|
178
|
+
raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
def _nrows_file(file: File, nrows: int) -> str:
|
datachain/lib/dc.py
CHANGED
|
@@ -981,10 +981,23 @@ class DataChain:
|
|
|
981
981
|
|
|
982
982
|
@resolve_columns
|
|
983
983
|
def order_by(self, *args, descending: bool = False) -> "Self":
|
|
984
|
-
"""Orders by specified set of
|
|
984
|
+
"""Orders by specified set of columns.
|
|
985
985
|
|
|
986
986
|
Parameters:
|
|
987
987
|
descending (bool): Whether to sort in descending order or not.
|
|
988
|
+
|
|
989
|
+
Example:
|
|
990
|
+
```py
|
|
991
|
+
dc.order_by("similarity_score", descending=True).limit(10)
|
|
992
|
+
```
|
|
993
|
+
|
|
994
|
+
Note:
|
|
995
|
+
Order is not guaranteed when steps are added after an `order_by` statement.
|
|
996
|
+
I.e. when using `from_dataset` an `order_by` statement should be used if
|
|
997
|
+
the order of the records in the chain is important.
|
|
998
|
+
Using `order_by` directly before `limit`, `collect` and `collect_flatten`
|
|
999
|
+
will give expected results.
|
|
1000
|
+
See https://github.com/iterative/datachain/issues/477 for further details.
|
|
988
1001
|
"""
|
|
989
1002
|
if descending:
|
|
990
1003
|
args = tuple(sqlalchemy.desc(a) for a in args)
|
|
@@ -1179,7 +1192,7 @@ class DataChain:
|
|
|
1179
1192
|
a tuple of row values.
|
|
1180
1193
|
"""
|
|
1181
1194
|
db_signals = self._effective_signals_schema.db_signals()
|
|
1182
|
-
with self._query.
|
|
1195
|
+
with self._query.ordered_select(*db_signals).as_iterable() as rows:
|
|
1183
1196
|
if row_factory:
|
|
1184
1197
|
rows = (row_factory(db_signals, r) for r in rows)
|
|
1185
1198
|
yield from rows
|
|
@@ -1270,7 +1283,7 @@ class DataChain:
|
|
|
1270
1283
|
chain = self.select(*cols) if cols else self
|
|
1271
1284
|
signals_schema = chain._effective_signals_schema
|
|
1272
1285
|
db_signals = signals_schema.db_signals()
|
|
1273
|
-
with self._query.
|
|
1286
|
+
with self._query.ordered_select(*db_signals).as_iterable() as rows:
|
|
1274
1287
|
for row in rows:
|
|
1275
1288
|
ret = signals_schema.row_to_features(
|
|
1276
1289
|
row, catalog=chain.session.catalog, cache=chain._settings.cache
|
|
@@ -1678,7 +1691,7 @@ class DataChain:
|
|
|
1678
1691
|
|
|
1679
1692
|
model_name = model_name or object_name or ""
|
|
1680
1693
|
hf_features = next(iter(ds_dict.values())).features
|
|
1681
|
-
output = output | get_output_schema(hf_features
|
|
1694
|
+
output = output | get_output_schema(hf_features)
|
|
1682
1695
|
model = dict_to_data_model(model_name, output)
|
|
1683
1696
|
if object_name:
|
|
1684
1697
|
output = {object_name: model}
|
datachain/lib/hf.py
CHANGED
|
@@ -138,17 +138,15 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
|
|
|
138
138
|
return HFAudio(**val)
|
|
139
139
|
|
|
140
140
|
|
|
141
|
-
def get_output_schema(
|
|
142
|
-
features: Features, model_name: str = "", stream: bool = True
|
|
143
|
-
) -> dict[str, DataType]:
|
|
141
|
+
def get_output_schema(features: Features) -> dict[str, DataType]:
|
|
144
142
|
"""Generate UDF output schema from huggingface datasets features."""
|
|
145
143
|
fields_dict = {}
|
|
146
144
|
for name, val in features.items():
|
|
147
|
-
fields_dict[name] = _feature_to_chain_type(name, val)
|
|
148
|
-
return fields_dict
|
|
145
|
+
fields_dict[name] = _feature_to_chain_type(name, val)
|
|
146
|
+
return fields_dict
|
|
149
147
|
|
|
150
148
|
|
|
151
|
-
def _feature_to_chain_type(name: str, val: Any) ->
|
|
149
|
+
def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
|
|
152
150
|
if isinstance(val, Value):
|
|
153
151
|
return arrow_type_mapper(val.pa_type)
|
|
154
152
|
if isinstance(val, ClassLabel):
|
datachain/query/dataset.py
CHANGED
|
@@ -1276,6 +1276,27 @@ class DatasetQuery:
|
|
|
1276
1276
|
query.steps.append(SQLSelect((*args, *named_args)))
|
|
1277
1277
|
return query
|
|
1278
1278
|
|
|
1279
|
+
@detach
|
|
1280
|
+
def ordered_select(self, *args, **kwargs) -> "Self":
|
|
1281
|
+
"""
|
|
1282
|
+
Select the given columns or expressions using a subquery whilst
|
|
1283
|
+
maintaining query ordering (only applicable if last step was order_by).
|
|
1284
|
+
|
|
1285
|
+
If used with no arguments, this simply creates a subquery and
|
|
1286
|
+
select all columns from it.
|
|
1287
|
+
|
|
1288
|
+
Example:
|
|
1289
|
+
>>> ds.ordered_select(C.name, C.size * 10)
|
|
1290
|
+
>>> ds.ordered_select(C.name, size10x=C.size * 10)
|
|
1291
|
+
"""
|
|
1292
|
+
named_args = [v.label(k) for k, v in kwargs.items()]
|
|
1293
|
+
query = self.clone()
|
|
1294
|
+
order_by = query.last_step if query.is_ordered else None
|
|
1295
|
+
query.steps.append(SQLSelect((*args, *named_args)))
|
|
1296
|
+
if order_by:
|
|
1297
|
+
query.steps.append(order_by)
|
|
1298
|
+
return query
|
|
1299
|
+
|
|
1279
1300
|
@detach
|
|
1280
1301
|
def select_except(self, *args) -> "Self":
|
|
1281
1302
|
"""
|
|
@@ -1338,7 +1359,7 @@ class DatasetQuery:
|
|
|
1338
1359
|
query = self.clone(new_table=False)
|
|
1339
1360
|
if (
|
|
1340
1361
|
query.steps
|
|
1341
|
-
and (last_step := query.
|
|
1362
|
+
and (last_step := query.last_step)
|
|
1342
1363
|
and isinstance(last_step, SQLLimit)
|
|
1343
1364
|
):
|
|
1344
1365
|
query.steps[-1] = SQLLimit(min(n, last_step.n))
|
|
@@ -1591,3 +1612,11 @@ class DatasetQuery:
|
|
|
1591
1612
|
finally:
|
|
1592
1613
|
self.cleanup()
|
|
1593
1614
|
return self.__class__(name=name, version=version, catalog=self.catalog)
|
|
1615
|
+
|
|
1616
|
+
@property
|
|
1617
|
+
def is_ordered(self) -> bool:
|
|
1618
|
+
return isinstance(self.last_step, SQLOrderBy)
|
|
1619
|
+
|
|
1620
|
+
@property
|
|
1621
|
+
def last_step(self) -> Optional[Step]:
|
|
1622
|
+
return self.steps[-1] if self.steps else None
|
datachain/sql/types.py
CHANGED
|
@@ -187,6 +187,22 @@ class Int32(Int):
|
|
|
187
187
|
return read_converter(dialect).int32(value)
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
class UInt32(Int):
|
|
191
|
+
def load_dialect_impl(self, dialect):
|
|
192
|
+
return converter(dialect).uint32()
|
|
193
|
+
|
|
194
|
+
@staticmethod
|
|
195
|
+
def default_value(dialect):
|
|
196
|
+
return type_defaults(dialect).uint32()
|
|
197
|
+
|
|
198
|
+
@staticmethod
|
|
199
|
+
def db_default_value(dialect):
|
|
200
|
+
return db_defaults(dialect).uint32()
|
|
201
|
+
|
|
202
|
+
def on_read_convert(self, value, dialect):
|
|
203
|
+
return read_converter(dialect).uint32(value)
|
|
204
|
+
|
|
205
|
+
|
|
190
206
|
class Int64(Int):
|
|
191
207
|
def load_dialect_impl(self, dialect):
|
|
192
208
|
return converter(dialect).int64()
|
|
@@ -395,6 +411,9 @@ class TypeReadConverter:
|
|
|
395
411
|
def int32(self, value):
|
|
396
412
|
return value
|
|
397
413
|
|
|
414
|
+
def uint32(self, value):
|
|
415
|
+
return value
|
|
416
|
+
|
|
398
417
|
def int64(self, value):
|
|
399
418
|
return value
|
|
400
419
|
|
|
@@ -446,6 +465,9 @@ class TypeConverter:
|
|
|
446
465
|
def int32(self):
|
|
447
466
|
return self.int()
|
|
448
467
|
|
|
468
|
+
def uint32(self):
|
|
469
|
+
return self.int()
|
|
470
|
+
|
|
449
471
|
def int64(self):
|
|
450
472
|
return self.int()
|
|
451
473
|
|
|
@@ -487,6 +509,9 @@ class TypeDefaults:
|
|
|
487
509
|
def int32(self):
|
|
488
510
|
return None
|
|
489
511
|
|
|
512
|
+
def uint32(self):
|
|
513
|
+
return None
|
|
514
|
+
|
|
490
515
|
def int64(self):
|
|
491
516
|
return None
|
|
492
517
|
|
|
@@ -528,6 +553,9 @@ class DBDefaults:
|
|
|
528
553
|
def int32(self):
|
|
529
554
|
return self.int()
|
|
530
555
|
|
|
556
|
+
def uint32(self):
|
|
557
|
+
return self.int()
|
|
558
|
+
|
|
531
559
|
def int64(self):
|
|
532
560
|
return self.int()
|
|
533
561
|
|
|
@@ -561,6 +589,7 @@ TYPES = [
|
|
|
561
589
|
Boolean,
|
|
562
590
|
Int,
|
|
563
591
|
Int32,
|
|
592
|
+
UInt32,
|
|
564
593
|
Int64,
|
|
565
594
|
UInt64,
|
|
566
595
|
Float,
|
|
@@ -26,7 +26,7 @@ datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,223
|
|
|
26
26
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
27
27
|
datachain/client/fsspec.py,sha256=C6C5AO6ndkgcoUxCRN9_8fUzqX2cRWJWG6FL6oD9X_Q,12708
|
|
28
28
|
datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
|
|
29
|
-
datachain/client/hf.py,sha256=
|
|
29
|
+
datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
|
|
30
30
|
datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
|
|
31
31
|
datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
|
|
32
32
|
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
|
|
@@ -39,13 +39,13 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
|
|
|
39
39
|
datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
|
|
40
40
|
datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
|
|
41
41
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
|
-
datachain/lib/arrow.py,sha256
|
|
42
|
+
datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
|
|
43
43
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
44
44
|
datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
|
|
45
45
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
46
|
-
datachain/lib/dc.py,sha256=
|
|
46
|
+
datachain/lib/dc.py,sha256=U1evAvSs563OMuUVildoaIOuOFiNB6fZcsN4BI8L9f0,85076
|
|
47
47
|
datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
|
|
48
|
-
datachain/lib/hf.py,sha256=
|
|
48
|
+
datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
|
|
49
49
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
50
50
|
datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
|
|
51
51
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
@@ -73,7 +73,7 @@ datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nf
|
|
|
73
73
|
datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
|
|
74
74
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
75
75
|
datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
|
|
76
|
-
datachain/query/dataset.py,sha256=
|
|
76
|
+
datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
|
|
77
77
|
datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
|
|
78
78
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
79
79
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -84,7 +84,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
|
84
84
|
datachain/remote/studio.py,sha256=yCjK5fYN-OseMwakUc2nWU3ktUJNBWJHHSRBaHAwfPw,8768
|
|
85
85
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
86
86
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
87
|
-
datachain/sql/types.py,sha256=
|
|
87
|
+
datachain/sql/types.py,sha256=RjgWb4Bh-pxzZpBCAyjbtDociU01ZPQ7l-SPueaRpNA,13991
|
|
88
88
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
89
89
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
90
90
|
datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
|
|
@@ -100,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
|
|
|
100
100
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
101
101
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
102
102
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
103
|
-
datachain-0.6.
|
|
104
|
-
datachain-0.6.
|
|
105
|
-
datachain-0.6.
|
|
106
|
-
datachain-0.6.
|
|
107
|
-
datachain-0.6.
|
|
108
|
-
datachain-0.6.
|
|
103
|
+
datachain-0.6.6.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
104
|
+
datachain-0.6.6.dist-info/METADATA,sha256=Z211Vh59IGXt-dRZTSI9zYgTnvmAmTPbmsfRh_vWE8Q,17188
|
|
105
|
+
datachain-0.6.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
106
|
+
datachain-0.6.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
107
|
+
datachain-0.6.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
108
|
+
datachain-0.6.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|