datachain 0.6.5__py3-none-any.whl → 0.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +2 -6
- datachain/data_storage/schema.py +4 -2
- datachain/lib/dc.py +17 -4
- datachain/lib/hf.py +4 -6
- datachain/query/dataset.py +30 -1
- datachain/sql/types.py +31 -0
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/METADATA +1 -1
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/RECORD +12 -12
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/LICENSE +0 -0
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/WHEEL +0 -0
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.5.dist-info → datachain-0.6.7.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
|
@@ -58,7 +58,7 @@ from datachain.listing import Listing
|
|
|
58
58
|
from datachain.node import DirType, Node, NodeWithPath
|
|
59
59
|
from datachain.nodes_thread_pool import NodesThreadPool
|
|
60
60
|
from datachain.remote.studio import StudioClient
|
|
61
|
-
from datachain.sql.types import DateTime, SQLType
|
|
61
|
+
from datachain.sql.types import DateTime, SQLType
|
|
62
62
|
from datachain.utils import (
|
|
63
63
|
DataChainDir,
|
|
64
64
|
batched,
|
|
@@ -196,11 +196,6 @@ class DatasetRowsFetcher(NodesThreadPool):
|
|
|
196
196
|
for c in [c for c, t in self.schema.items() if t == DateTime]:
|
|
197
197
|
df[c] = pd.to_datetime(df[c], unit="s")
|
|
198
198
|
|
|
199
|
-
# strings are represented as binaries in parquet export so need to
|
|
200
|
-
# decode it back to strings
|
|
201
|
-
for c in [c for c, t in self.schema.items() if t == String]:
|
|
202
|
-
df[c] = df[c].str.decode("utf-8")
|
|
203
|
-
|
|
204
199
|
def do_task(self, urls):
|
|
205
200
|
import lz4.frame
|
|
206
201
|
import pandas as pd
|
|
@@ -1403,6 +1398,7 @@ class Catalog:
|
|
|
1403
1398
|
query_script=remote_dataset_version.query_script,
|
|
1404
1399
|
create_rows=True,
|
|
1405
1400
|
columns=columns,
|
|
1401
|
+
feature_schema=remote_dataset_version.feature_schema,
|
|
1406
1402
|
validate_version=False,
|
|
1407
1403
|
)
|
|
1408
1404
|
|
datachain/data_storage/schema.py
CHANGED
|
@@ -145,6 +145,8 @@ class DirExpansion:
|
|
|
145
145
|
|
|
146
146
|
|
|
147
147
|
class DataTable:
|
|
148
|
+
MAX_RANDOM = 2**63 - 1
|
|
149
|
+
|
|
148
150
|
def __init__(
|
|
149
151
|
self,
|
|
150
152
|
name: str,
|
|
@@ -269,8 +271,8 @@ class DataTable:
|
|
|
269
271
|
def delete(self):
|
|
270
272
|
return self.apply_conditions(self.table.delete())
|
|
271
273
|
|
|
272
|
-
@
|
|
273
|
-
def sys_columns():
|
|
274
|
+
@classmethod
|
|
275
|
+
def sys_columns(cls):
|
|
274
276
|
return [
|
|
275
277
|
sa.Column("sys__id", Int, primary_key=True),
|
|
276
278
|
sa.Column(
|
datachain/lib/dc.py
CHANGED
|
@@ -981,10 +981,23 @@ class DataChain:
|
|
|
981
981
|
|
|
982
982
|
@resolve_columns
|
|
983
983
|
def order_by(self, *args, descending: bool = False) -> "Self":
|
|
984
|
-
"""Orders by specified set of
|
|
984
|
+
"""Orders by specified set of columns.
|
|
985
985
|
|
|
986
986
|
Parameters:
|
|
987
987
|
descending (bool): Whether to sort in descending order or not.
|
|
988
|
+
|
|
989
|
+
Example:
|
|
990
|
+
```py
|
|
991
|
+
dc.order_by("similarity_score", descending=True).limit(10)
|
|
992
|
+
```
|
|
993
|
+
|
|
994
|
+
Note:
|
|
995
|
+
Order is not guaranteed when steps are added after an `order_by` statement.
|
|
996
|
+
I.e. when using `from_dataset` an `order_by` statement should be used if
|
|
997
|
+
the order of the records in the chain is important.
|
|
998
|
+
Using `order_by` directly before `limit`, `collect` and `collect_flatten`
|
|
999
|
+
will give expected results.
|
|
1000
|
+
See https://github.com/iterative/datachain/issues/477 for further details.
|
|
988
1001
|
"""
|
|
989
1002
|
if descending:
|
|
990
1003
|
args = tuple(sqlalchemy.desc(a) for a in args)
|
|
@@ -1179,7 +1192,7 @@ class DataChain:
|
|
|
1179
1192
|
a tuple of row values.
|
|
1180
1193
|
"""
|
|
1181
1194
|
db_signals = self._effective_signals_schema.db_signals()
|
|
1182
|
-
with self._query.
|
|
1195
|
+
with self._query.ordered_select(*db_signals).as_iterable() as rows:
|
|
1183
1196
|
if row_factory:
|
|
1184
1197
|
rows = (row_factory(db_signals, r) for r in rows)
|
|
1185
1198
|
yield from rows
|
|
@@ -1270,7 +1283,7 @@ class DataChain:
|
|
|
1270
1283
|
chain = self.select(*cols) if cols else self
|
|
1271
1284
|
signals_schema = chain._effective_signals_schema
|
|
1272
1285
|
db_signals = signals_schema.db_signals()
|
|
1273
|
-
with self._query.
|
|
1286
|
+
with self._query.ordered_select(*db_signals).as_iterable() as rows:
|
|
1274
1287
|
for row in rows:
|
|
1275
1288
|
ret = signals_schema.row_to_features(
|
|
1276
1289
|
row, catalog=chain.session.catalog, cache=chain._settings.cache
|
|
@@ -1678,7 +1691,7 @@ class DataChain:
|
|
|
1678
1691
|
|
|
1679
1692
|
model_name = model_name or object_name or ""
|
|
1680
1693
|
hf_features = next(iter(ds_dict.values())).features
|
|
1681
|
-
output = output | get_output_schema(hf_features
|
|
1694
|
+
output = output | get_output_schema(hf_features)
|
|
1682
1695
|
model = dict_to_data_model(model_name, output)
|
|
1683
1696
|
if object_name:
|
|
1684
1697
|
output = {object_name: model}
|
datachain/lib/hf.py
CHANGED
|
@@ -138,17 +138,15 @@ def convert_feature(val: Any, feat: Any, anno: Any) -> Any: # noqa: PLR0911
|
|
|
138
138
|
return HFAudio(**val)
|
|
139
139
|
|
|
140
140
|
|
|
141
|
-
def get_output_schema(
|
|
142
|
-
features: Features, model_name: str = "", stream: bool = True
|
|
143
|
-
) -> dict[str, DataType]:
|
|
141
|
+
def get_output_schema(features: Features) -> dict[str, DataType]:
|
|
144
142
|
"""Generate UDF output schema from huggingface datasets features."""
|
|
145
143
|
fields_dict = {}
|
|
146
144
|
for name, val in features.items():
|
|
147
|
-
fields_dict[name] = _feature_to_chain_type(name, val)
|
|
148
|
-
return fields_dict
|
|
145
|
+
fields_dict[name] = _feature_to_chain_type(name, val)
|
|
146
|
+
return fields_dict
|
|
149
147
|
|
|
150
148
|
|
|
151
|
-
def _feature_to_chain_type(name: str, val: Any) ->
|
|
149
|
+
def _feature_to_chain_type(name: str, val: Any) -> DataType: # noqa: PLR0911
|
|
152
150
|
if isinstance(val, Value):
|
|
153
151
|
return arrow_type_mapper(val.pa_type)
|
|
154
152
|
if isinstance(val, ClassLabel):
|
datachain/query/dataset.py
CHANGED
|
@@ -1276,6 +1276,27 @@ class DatasetQuery:
|
|
|
1276
1276
|
query.steps.append(SQLSelect((*args, *named_args)))
|
|
1277
1277
|
return query
|
|
1278
1278
|
|
|
1279
|
+
@detach
|
|
1280
|
+
def ordered_select(self, *args, **kwargs) -> "Self":
|
|
1281
|
+
"""
|
|
1282
|
+
Select the given columns or expressions using a subquery whilst
|
|
1283
|
+
maintaining query ordering (only applicable if last step was order_by).
|
|
1284
|
+
|
|
1285
|
+
If used with no arguments, this simply creates a subquery and
|
|
1286
|
+
select all columns from it.
|
|
1287
|
+
|
|
1288
|
+
Example:
|
|
1289
|
+
>>> ds.ordered_select(C.name, C.size * 10)
|
|
1290
|
+
>>> ds.ordered_select(C.name, size10x=C.size * 10)
|
|
1291
|
+
"""
|
|
1292
|
+
named_args = [v.label(k) for k, v in kwargs.items()]
|
|
1293
|
+
query = self.clone()
|
|
1294
|
+
order_by = query.last_step if query.is_ordered else None
|
|
1295
|
+
query.steps.append(SQLSelect((*args, *named_args)))
|
|
1296
|
+
if order_by:
|
|
1297
|
+
query.steps.append(order_by)
|
|
1298
|
+
return query
|
|
1299
|
+
|
|
1279
1300
|
@detach
|
|
1280
1301
|
def select_except(self, *args) -> "Self":
|
|
1281
1302
|
"""
|
|
@@ -1338,7 +1359,7 @@ class DatasetQuery:
|
|
|
1338
1359
|
query = self.clone(new_table=False)
|
|
1339
1360
|
if (
|
|
1340
1361
|
query.steps
|
|
1341
|
-
and (last_step := query.
|
|
1362
|
+
and (last_step := query.last_step)
|
|
1342
1363
|
and isinstance(last_step, SQLLimit)
|
|
1343
1364
|
):
|
|
1344
1365
|
query.steps[-1] = SQLLimit(min(n, last_step.n))
|
|
@@ -1591,3 +1612,11 @@ class DatasetQuery:
|
|
|
1591
1612
|
finally:
|
|
1592
1613
|
self.cleanup()
|
|
1593
1614
|
return self.__class__(name=name, version=version, catalog=self.catalog)
|
|
1615
|
+
|
|
1616
|
+
@property
|
|
1617
|
+
def is_ordered(self) -> bool:
|
|
1618
|
+
return isinstance(self.last_step, SQLOrderBy)
|
|
1619
|
+
|
|
1620
|
+
@property
|
|
1621
|
+
def last_step(self) -> Optional[Step]:
|
|
1622
|
+
return self.steps[-1] if self.steps else None
|
datachain/sql/types.py
CHANGED
|
@@ -187,6 +187,22 @@ class Int32(Int):
|
|
|
187
187
|
return read_converter(dialect).int32(value)
|
|
188
188
|
|
|
189
189
|
|
|
190
|
+
class UInt32(Int):
|
|
191
|
+
def load_dialect_impl(self, dialect):
|
|
192
|
+
return converter(dialect).uint32()
|
|
193
|
+
|
|
194
|
+
@staticmethod
|
|
195
|
+
def default_value(dialect):
|
|
196
|
+
return type_defaults(dialect).uint32()
|
|
197
|
+
|
|
198
|
+
@staticmethod
|
|
199
|
+
def db_default_value(dialect):
|
|
200
|
+
return db_defaults(dialect).uint32()
|
|
201
|
+
|
|
202
|
+
def on_read_convert(self, value, dialect):
|
|
203
|
+
return read_converter(dialect).uint32(value)
|
|
204
|
+
|
|
205
|
+
|
|
190
206
|
class Int64(Int):
|
|
191
207
|
def load_dialect_impl(self, dialect):
|
|
192
208
|
return converter(dialect).int64()
|
|
@@ -395,6 +411,9 @@ class TypeReadConverter:
|
|
|
395
411
|
def int32(self, value):
|
|
396
412
|
return value
|
|
397
413
|
|
|
414
|
+
def uint32(self, value):
|
|
415
|
+
return value
|
|
416
|
+
|
|
398
417
|
def int64(self, value):
|
|
399
418
|
return value
|
|
400
419
|
|
|
@@ -421,6 +440,8 @@ class TypeReadConverter:
|
|
|
421
440
|
|
|
422
441
|
def json(self, value):
|
|
423
442
|
if isinstance(value, str):
|
|
443
|
+
if value == "":
|
|
444
|
+
return {}
|
|
424
445
|
return orjson.loads(value)
|
|
425
446
|
return value
|
|
426
447
|
|
|
@@ -446,6 +467,9 @@ class TypeConverter:
|
|
|
446
467
|
def int32(self):
|
|
447
468
|
return self.int()
|
|
448
469
|
|
|
470
|
+
def uint32(self):
|
|
471
|
+
return self.int()
|
|
472
|
+
|
|
449
473
|
def int64(self):
|
|
450
474
|
return self.int()
|
|
451
475
|
|
|
@@ -487,6 +511,9 @@ class TypeDefaults:
|
|
|
487
511
|
def int32(self):
|
|
488
512
|
return None
|
|
489
513
|
|
|
514
|
+
def uint32(self):
|
|
515
|
+
return None
|
|
516
|
+
|
|
490
517
|
def int64(self):
|
|
491
518
|
return None
|
|
492
519
|
|
|
@@ -528,6 +555,9 @@ class DBDefaults:
|
|
|
528
555
|
def int32(self):
|
|
529
556
|
return self.int()
|
|
530
557
|
|
|
558
|
+
def uint32(self):
|
|
559
|
+
return self.int()
|
|
560
|
+
|
|
531
561
|
def int64(self):
|
|
532
562
|
return self.int()
|
|
533
563
|
|
|
@@ -561,6 +591,7 @@ TYPES = [
|
|
|
561
591
|
Boolean,
|
|
562
592
|
Int,
|
|
563
593
|
Int32,
|
|
594
|
+
UInt32,
|
|
564
595
|
Int64,
|
|
565
596
|
UInt64,
|
|
566
597
|
Float,
|
|
@@ -18,7 +18,7 @@ datachain/studio.py,sha256=d-jUsYpfI1LEv3g8KU-lLchVgb9L0TXvlHakieFud_E,3788
|
|
|
18
18
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
19
|
datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
|
|
20
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
21
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=VwItaZG8MUqNKYz0xopDCdkVkbbxgTZYky3ElgsK5-M,57183
|
|
22
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
23
23
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
24
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
@@ -34,7 +34,7 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
|
|
|
34
34
|
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
|
|
35
35
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
36
36
|
datachain/data_storage/metastore.py,sha256=-TJCqG70VofSVOh2yEez4dwjHS3eQL8p7d9uO3WTVwM,35878
|
|
37
|
-
datachain/data_storage/schema.py,sha256=
|
|
37
|
+
datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
|
|
38
38
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
39
39
|
datachain/data_storage/sqlite.py,sha256=wb8xlMJYYyt59wft0psJj587d-AwpNThzIqspVcKnRI,27388
|
|
40
40
|
datachain/data_storage/warehouse.py,sha256=xwMaR4jBpR13vjG3zrhphH4z2_CFLNj0KPF0LJCXCJ8,30727
|
|
@@ -43,9 +43,9 @@ datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
|
|
|
43
43
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
44
44
|
datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
|
|
45
45
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
46
|
-
datachain/lib/dc.py,sha256=
|
|
46
|
+
datachain/lib/dc.py,sha256=U1evAvSs563OMuUVildoaIOuOFiNB6fZcsN4BI8L9f0,85076
|
|
47
47
|
datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
|
|
48
|
-
datachain/lib/hf.py,sha256=
|
|
48
|
+
datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
|
|
49
49
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
50
50
|
datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
|
|
51
51
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
@@ -73,7 +73,7 @@ datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nf
|
|
|
73
73
|
datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
|
|
74
74
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
75
75
|
datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
|
|
76
|
-
datachain/query/dataset.py,sha256=
|
|
76
|
+
datachain/query/dataset.py,sha256=MGArYxioeGvm8w7hQtQAjEI6wsZN_XAoh4-jO4d0U5Q,53926
|
|
77
77
|
datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
|
|
78
78
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
79
79
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -84,7 +84,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
|
84
84
|
datachain/remote/studio.py,sha256=yCjK5fYN-OseMwakUc2nWU3ktUJNBWJHHSRBaHAwfPw,8768
|
|
85
85
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
86
86
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
87
|
-
datachain/sql/types.py,sha256=
|
|
87
|
+
datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
|
|
88
88
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
89
89
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
90
90
|
datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
|
|
@@ -100,9 +100,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
|
|
|
100
100
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
101
101
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
102
102
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
103
|
-
datachain-0.6.
|
|
104
|
-
datachain-0.6.
|
|
105
|
-
datachain-0.6.
|
|
106
|
-
datachain-0.6.
|
|
107
|
-
datachain-0.6.
|
|
108
|
-
datachain-0.6.
|
|
103
|
+
datachain-0.6.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
104
|
+
datachain-0.6.7.dist-info/METADATA,sha256=JfsOnrPpyCXuxHel2XXD2BQXK6khsm-z25jxUAx8KIk,17188
|
|
105
|
+
datachain-0.6.7.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
106
|
+
datachain-0.6.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
107
|
+
datachain-0.6.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
108
|
+
datachain-0.6.7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|