datachain 0.24.2__py3-none-any.whl → 0.24.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of datachain might be problematic.
- datachain/catalog/catalog.py +19 -2
- datachain/data_storage/metastore.py +3 -1
- datachain/data_storage/sqlite.py +9 -6
- datachain/dataset.py +1 -1
- datachain/lib/dc/datachain.py +26 -1
- datachain/lib/dc/datasets.py +1 -0
- datachain/listing.py +10 -3
- datachain/namespace.py +1 -1
- datachain/project.py +1 -1
- datachain/query/dataset.py +5 -1
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/METADATA +1 -1
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/RECORD +16 -16
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/WHEEL +0 -0
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED

```diff
@@ -1098,9 +1098,18 @@ class Catalog:
     ) -> DatasetRecord:
         from datachain.lib.listing import is_listing_dataset
 
+        project = project or self.metastore.default_project
+
         if is_listing_dataset(name):
             project = self.metastore.listing_project
-        return self.metastore.get_dataset(name, project.id if project else None)
+
+        try:
+            return self.metastore.get_dataset(name, project.id if project else None)
+        except DatasetNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset {name} not found in namespace {project.namespace.name}"
+                f" and project {project.name}"
+            ) from None
 
     def get_dataset_with_remote_fallback(
         self,
@@ -1111,6 +1120,14 @@ class Catalog:
         pull_dataset: bool = False,
         update: bool = False,
     ) -> DatasetRecord:
+        # Intentionally ignore update flag is version is provided. Here only exact
+        # version can be provided and update then doesn't make sense.
+        # It corresponds to a query like this for example:
+        #
+        # dc.read_dataset("some.remote.dataset", version="1.0.0", update=True)
+        if version:
+            update = False
+
         if self.metastore.is_local_dataset(namespace_name) or not update:
             try:
                 project = self.metastore.get_project(project_name, namespace_name)
@@ -1124,7 +1141,7 @@ class Catalog:
             raise DatasetNotFoundError(
                 f"Dataset {name}"
                 + (f" version {version} " if version else " ")
-                + "not found"
+                + f"not found in namespace {namespace_name} and project {project_name}"
             )
 
         if pull_dataset:
```
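Taken together, these hunks change what a caller sees when a dataset is missing and how `update` interacts with a pinned `version`. A minimal sketch of how that surfaces through the public API (the dataset name below is made up):

```python
import datachain as dc
from datachain.error import DatasetNotFoundError

try:
    dc.read_dataset("my_missing_dataset")
except DatasetNotFoundError as exc:
    # As of 0.24.4 the message also names the namespace and project that were
    # searched, e.g. "Dataset my_missing_dataset not found in namespace <ns>
    # and project <proj>".
    print(exc)

# Per the new comment above, update is ignored when an exact version is pinned:
# dc.read_dataset("some.remote.dataset", version="1.0.0", update=True)
```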
datachain/data_storage/metastore.py
CHANGED

```diff
@@ -1194,14 +1194,16 @@ class AbstractDBMetastore(AbstractMetastore):
         Gets a single dataset in project by dataset name.
         """
         project_id = project_id or self.default_project.id
+
         d = self._datasets
         query = self._base_dataset_query()
         query = query.where(d.c.name == name, d.c.project_id == project_id)  # type: ignore [attr-defined]
         ds = self._parse_dataset(self.db.execute(query, conn=conn))
         if not ds:
             raise DatasetNotFoundError(
-                f"Dataset {name} not found in project {project_id}"
+                f"Dataset {name} not found in project with id {project_id}"
             )
+
         return ds
 
     def remove_dataset_version(
```
datachain/data_storage/sqlite.py
CHANGED

```diff
@@ -774,7 +774,15 @@ class SQLiteWarehouse(AbstractWarehouse):
         query: Select,
         progress_cb: Optional[Callable[[int], None]] = None,
     ) -> None:
-        if len(query._group_by_clause) > 0:
+        col_id = (
+            query.selected_columns.sys__id
+            if "sys__id" in query.selected_columns
+            else None
+        )
+
+        # If there is no sys__id column, we cannot copy the table in batches,
+        # and we need to copy all rows at once. Same if there is a group by clause.
+        if col_id is None or len(query._group_by_clause) > 0:
             select_q = query.with_only_columns(
                 *[c for c in query.selected_columns if c.name != "sys__id"]
             )
@@ -782,12 +790,7 @@ class SQLiteWarehouse(AbstractWarehouse):
             self.db.execute(q)
             return
 
-        if "sys__id" in query.selected_columns:
-            col_id = query.selected_columns.sys__id
-        else:
-            col_id = sqlalchemy.column("sys__id")
         select_ids = query.with_only_columns(col_id)
-
         ids = self.db.execute(select_ids).fetchall()
 
         select_q = (
```
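The new comment states the rule the refactor enforces: copy rows in `sys__id` batches only when that column is selected and there is no GROUP BY, otherwise fall back to a single `INSERT ... SELECT`. A self-contained sketch of that pattern in plain SQLAlchemy, not datachain's implementation (the `copy_rows` helper, engine, and destination table are assumptions for illustration):

```python
import sqlalchemy as sa


def copy_rows(engine, src_select, dst_table, batch_size=10_000):
    # Batch by id only when the select exposes a sys__id column; the private
    # _group_by_clause check mirrors the condition used in copy_table above.
    col_id = (
        src_select.selected_columns.sys__id
        if "sys__id" in src_select.selected_columns
        else None
    )
    with engine.begin() as conn:
        if col_id is None or len(src_select._group_by_clause) > 0:
            # No stable id to batch on: copy everything in one INSERT ... SELECT.
            cols = [c for c in src_select.selected_columns if c.name != "sys__id"]
            q = src_select.with_only_columns(*cols)
            conn.execute(dst_table.insert().from_select([c.name for c in cols], q))
            return

        # Otherwise fetch the ids once and copy the rows in id-based batches.
        ids = [row[0] for row in conn.execute(src_select.with_only_columns(col_id))]
        for start in range(0, len(ids), batch_size):
            chunk = ids[start : start + batch_size]
            batch_q = src_select.where(col_id.in_(chunk))
            names = [c.name for c in batch_q.selected_columns]
            conn.execute(dst_table.insert().from_select(names, batch_q))
```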
datachain/dataset.py
CHANGED
datachain/lib/dc/datachain.py
CHANGED

````diff
@@ -21,6 +21,7 @@ from typing import (
 import orjson
 import sqlalchemy
 from pydantic import BaseModel
+from sqlalchemy.sql.elements import ColumnElement
 from tqdm import tqdm
 
 from datachain import semver
@@ -806,11 +807,35 @@ class DataChain:
             chain.save("new_dataset")
             ```
         """
+        # Convert string partition_by parameters to Column objects
+        processed_partition_by = partition_by
+        if partition_by is not None:
+            if isinstance(partition_by, (str, Function, ColumnElement)):
+                list_partition_by = [partition_by]
+            else:
+                list_partition_by = list(partition_by)
+
+            processed_partition_columns: list[ColumnElement] = []
+            for col in list_partition_by:
+                if isinstance(col, str):
+                    col_db_name = ColumnMeta.to_db_name(col)
+                    col_type = self.signals_schema.get_column_type(col_db_name)
+                    column = Column(col_db_name, python_to_sql(col_type))
+                    processed_partition_columns.append(column)
+                elif isinstance(col, Function):
+                    column = col.get_column(self.signals_schema)
+                    processed_partition_columns.append(column)
+                else:
+                    # Assume it's already a ColumnElement
+                    processed_partition_columns.append(col)
+
+            processed_partition_by = processed_partition_columns
+
         udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
         return self._evolve(
             query=self._query.generate(
                 udf_obj.to_udf_wrapper(),
-                partition_by=partition_by,
+                partition_by=processed_partition_by,
                 **self._settings.to_dict(),
             ),
             signal_schema=udf_obj.output,
````
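For users, the practical effect is that `DataChain.agg()` now accepts `partition_by` as plain signal names or datachain `Function` objects, not only pre-built Column objects. A hedged usage sketch; the dataset, signals, and aggregator below are illustrative:

```python
import datachain as dc

chain = dc.read_dataset("my_dataset")

per_category = chain.agg(
    total=lambda amount: [sum(amount)],  # one output row per partition
    params=("amount",),
    output=float,
    # Previously this had to be a ColumnElement; a plain signal name (or a
    # datachain Function) is now resolved to a Column internally.
    partition_by="category",
)
```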
datachain/lib/dc/datasets.py
CHANGED
datachain/listing.py
CHANGED

```diff
@@ -65,10 +65,17 @@ class Listing:
 
     @cached_property
     def dataset(self) -> "DatasetRecord":
+        from datachain.error import DatasetNotFoundError
+
         assert self.dataset_name
-        return self.metastore.get_dataset(
-            self.dataset_name, self.metastore.listing_project.id
-        )
+        project = self.metastore.listing_project
+        try:
+            return self.metastore.get_dataset(self.dataset_name, project.id)
+        except DatasetNotFoundError:
+            raise DatasetNotFoundError(
+                f"Dataset {self.dataset_name} not found in namespace"
+                f" {project.namespace.name} and project {project.name}"
+            ) from None
 
     @cached_property
     def dataset_rows(self):
```
datachain/namespace.py
CHANGED
datachain/project.py
CHANGED
datachain/query/dataset.py
CHANGED

```diff
@@ -82,7 +82,10 @@ if TYPE_CHECKING:
 INSERT_BATCH_SIZE = 10000
 
 PartitionByType = Union[
-    ColumnElement, Sequence[ColumnElement]
+    str,
+    Function,
+    ColumnElement,
+    Sequence[Union[str, Function, ColumnElement]],
 ]
 JoinPredicateType = Union[str, ColumnClause, ColumnElement]
 DatasetDependencyType = tuple["DatasetRecord", str]
@@ -1142,6 +1145,7 @@ class DatasetQuery:
                 project_name=project_name,
                 version=version,
                 pull_dataset=True,
+                update=update,
             )
         )
 
```
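The widened `PartitionByType` mirrors the conversion added in `datachain/lib/dc/datachain.py`: a string, a `Function`, a `ColumnElement`, or a sequence mixing them are all accepted, while the second hunk simply forwards the caller's `update` flag when pulling a remote dataset. A small illustration of the accepted `partition_by` forms (signal names are made up):

```python
from datachain import C

partition_by = "category"                    # str: a plain signal name
partition_by = C("category")                 # a ColumnElement
partition_by = ["category", C("file.path")]  # a sequence mixing the forms
# datachain Function objects (from datachain.func) are accepted as well.
```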
{datachain-0.24.2.dist-info → datachain-0.24.4.dist-info}/RECORD
CHANGED

```diff
@@ -3,17 +3,17 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=ATGa-CBTFoZeTN2V40-zHEzfMBcdYK0WuoJ6H2yEAvo,25268
 datachain/delta.py,sha256=fTEhCedseUsHuH_Ek52NXFhFPyFD_6MioEH5sCilNgo,9897
 datachain/error.py,sha256=OWwWMkzZYJrkcoEDGhJHMf7SfKvxcsOLRF94mjPf29I,1609
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
-datachain/listing.py,sha256=
-datachain/namespace.py,sha256=
+datachain/listing.py,sha256=GuUlj3oYJQOIbLYr3IqNCSeuiU5pfOtRAQQl9uBEFQU,7458
+datachain/namespace.py,sha256=MozcXYxedIbamzY56YKy9r9fgSpOm2VryhWfIf6stYk,1791
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
 datachain/nodes_fetcher.py,sha256=_wgaKyqEjkqdwJ_Hj6D8vUYz7hnU7g6xhm0H6ZnYxmE,1095
 datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2SmM,3989
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
-datachain/project.py,sha256=
+datachain/project.py,sha256=90D4GpJSA3t0fayYZbzrL3sk4U7EJhQo8psnWvdI7_o,2280
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
 datachain/semver.py,sha256=UB8GHPBtAP3UJGeiuJoInD7SK-DnB93_Xd1qy_CQ9cU,2074
@@ -21,7 +21,7 @@ datachain/studio.py,sha256=bLok-eJNFRHQScEyAyA_Fas52dmijd5r-73KudWxV4k,13337
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=3QwiljkEHWm5xNmvqT3ey_JvKS8viiJgGWwhbqWIH0M,65996
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
 datachain/cli/__init__.py,sha256=WvBqnwjG8Wp9xGCn-4eqfoZ3n7Sj1HJemCi4MayJh_c,8221
@@ -49,10 +49,10 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=af7TsMHbANXmWKIu-LzQmsZpNxM6-hhzZfInWx7MQXI,52667
 datachain/data_storage/schema.py,sha256=o3JbURKXRg3IJyIVA4QjHHkn6byRuz7avbydU2FlvNY,9897
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=
+datachain/data_storage/sqlite.py,sha256=TTQjdDXUaZSr3MEaxZjDhsVIkIJqxFNA-sD25TO3m_4,30228
 datachain/data_storage/warehouse.py,sha256=2Bp2fXfcm-acwYjDWqVzGjoIQSAR4L56GPNtPcaT2gU,32418
 datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
 datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -103,8 +103,8 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=q6a9BpapGwP6nwy6c5cklxQumep2fUp9l2LAjtTJr6s,4411
 datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,6051
-datachain/lib/dc/datachain.py,sha256=
-datachain/lib/dc/datasets.py,sha256=
+datachain/lib/dc/datachain.py,sha256=_FJnpgNN_b2xz39MsgeS0NTto0hzpcFPbJlaUBLcqTs,87094
+datachain/lib/dc/datasets.py,sha256=MzM7MTn90Q-dZYuMNUzJXRW1YHOpHeFHspadRjQoI70,13297
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
 datachain/lib/dc/listings.py,sha256=V379Cb-7ZyquM0w7sWArQZkzInZy4GB7QQ1ZfowKzQY,4544
@@ -125,7 +125,7 @@ datachain/model/ultralytics/pose.py,sha256=pBlmt63Qe68FKmexHimUGlNbNOoOlMHXG4fzX
 datachain/model/ultralytics/segment.py,sha256=63bDCj43E6iZ0hFI5J6uQfksdCmjEp6sEm1XzVaE8pw,2986
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
-datachain/query/dataset.py,sha256=
+datachain/query/dataset.py,sha256=t9EWZkJGPRPcBvKOsFO7ZiaTeUXc8YuTZydRbcv83_E,61350
 datachain/query/dispatch.py,sha256=A0nPxn6mEN5d9dDo6S8m16Ji_9IvJLXrgF2kqXdi4fs,15546
 datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -157,9 +157,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.24.
-datachain-0.24.
-datachain-0.24.
-datachain-0.24.
-datachain-0.24.
-datachain-0.24.
+datachain-0.24.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.24.4.dist-info/METADATA,sha256=N2RlOFlgnsdSuBeN_JKm_NZEZY6qvBv8YgSiXeGeRZQ,13281
+datachain-0.24.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.24.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.24.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.24.4.dist-info/RECORD,,
```