datachain 0.18.3__py3-none-any.whl → 0.18.5__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- datachain/catalog/catalog.py +36 -22
- datachain/client/azure.py +1 -1
- datachain/client/gcs.py +1 -1
- datachain/client/s3.py +5 -3
- datachain/data_storage/metastore.py +87 -42
- datachain/dataset.py +1 -1
- datachain/func/aggregate.py +64 -38
- datachain/func/array.py +102 -73
- datachain/func/conditional.py +71 -51
- datachain/func/func.py +1 -1
- datachain/func/numeric.py +55 -36
- datachain/func/path.py +32 -20
- datachain/func/random.py +2 -2
- datachain/func/string.py +59 -37
- datachain/func/window.py +7 -8
- datachain/lib/dc/datachain.py +9 -0
- datachain/lib/listing.py +2 -3
- datachain/model/ultralytics/bbox.py +6 -4
- datachain/model/ultralytics/pose.py +6 -4
- datachain/model/ultralytics/segment.py +6 -4
- datachain/remote/studio.py +4 -2
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/METADATA +3 -3
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/RECORD +27 -27
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/WHEEL +1 -1
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.3.dist-info → datachain-0.18.5.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -66,6 +66,7 @@ if TYPE_CHECKING:
     )
     from datachain.dataset import DatasetListVersion
     from datachain.job import Job
+    from datachain.lib.listing_info import ListingInfo
     from datachain.listing import Listing
 
 logger = logging.getLogger("datachain")
@@ -910,11 +911,7 @@ class Catalog:
             values["num_objects"] = None
             values["size"] = None
             values["preview"] = None
-            self.metastore.update_dataset_version(
-                dataset,
-                version,
-                **values,
-            )
+            self.metastore.update_dataset_version(dataset, version, **values)
             return
 
         if not dataset_version.num_objects:
@@ -934,11 +931,7 @@ class Catalog:
         if not values:
             return
 
-        self.metastore.update_dataset_version(
-            dataset,
-            version,
-            **values,
-        )
+        self.metastore.update_dataset_version(dataset, version, **values)
 
     def update_dataset(
         self, dataset: DatasetRecord, conn=None, **kwargs
@@ -1116,13 +1109,16 @@ class Catalog:
         return direct_dependencies
 
     def ls_datasets(
-        self, include_listing: bool = False, studio: bool = False
+        self,
+        prefix: Optional[str] = None,
+        include_listing: bool = False,
+        studio: bool = False,
     ) -> Iterator[DatasetListRecord]:
         from datachain.remote.studio import StudioClient
 
         if studio:
             client = StudioClient()
-            response = client.ls_datasets()
+            response = client.ls_datasets(prefix=prefix)
             if not response.ok:
                 raise DataChainError(response.message)
             if not response.data:
@@ -1133,6 +1129,8 @@ class Catalog:
                 for d in response.data
                 if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
             )
+        elif prefix:
+            datasets = self.metastore.list_datasets_by_prefix(prefix)
         else:
             datasets = self.metastore.list_datasets()
 
@@ -1142,39 +1140,55 @@
 
     def list_datasets_versions(
         self,
+        prefix: Optional[str] = None,
         include_listing: bool = False,
+        with_job: bool = True,
         studio: bool = False,
     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
         datasets = list(
-            self.ls_datasets(include_listing=include_listing, studio=studio)
+            self.ls_datasets(
+                prefix=prefix, include_listing=include_listing, studio=studio
+            )
         )
 
         # preselect dataset versions jobs from db to avoid multiple queries
-        jobs_ids: set[str] = {
-            v.job_id for ds in datasets for v in ds.versions if v.job_id
-        }
         jobs: dict[str, Job] = {}
-        if jobs_ids:
-            jobs = {j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))}
+        if with_job:
+            jobs_ids: set[str] = {
+                v.job_id for ds in datasets for v in ds.versions if v.job_id
+            }
+            if jobs_ids:
+                jobs = {
+                    j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))
+                }
 
         for d in datasets:
             yield from (
-                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                (d, v, jobs.get(str(v.job_id)) if with_job and v.job_id else None)
                 for v in d.versions
            )
 
-    def listings(self):
+    def listings(self, prefix: Optional[str] = None) -> list["ListingInfo"]:
         """
         Returns list of ListingInfo objects which are representing specific
         storage listing datasets
         """
-        from datachain.lib.listing import is_listing_dataset
+        from datachain.lib.listing import LISTING_PREFIX, is_listing_dataset
         from datachain.lib.listing_info import ListingInfo
 
+        if prefix and not prefix.startswith(LISTING_PREFIX):
+            prefix = LISTING_PREFIX + prefix
+
+        listing_datasets_versions = self.list_datasets_versions(
+            prefix=prefix,
+            include_listing=True,
+            with_job=False,
+        )
+
         return [
             ListingInfo.from_models(d, v, j)
-            for d, v, j in self.list_datasets_versions(include_listing=True)
+            for d, v, j in listing_datasets_versions
             if is_listing_dataset(d.name)
         ]
 
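Note: the catalog changes above thread a prefix argument from listings() through list_datasets_versions() down to ls_datasets(), so name filtering happens in the metastore (list_datasets_by_prefix) or is forwarded to Studio, and listings() now skips job lookups (with_job=False). A minimal usage sketch, assuming a local catalog obtained via datachain.catalog.get_catalog(); the dataset-name prefix and bucket are hypothetical:

from datachain.catalog import get_catalog

catalog = get_catalog()

# Datasets whose names start with a given prefix; with studio=True the prefix
# would instead be forwarded to StudioClient.ls_datasets(prefix=...).
for ds in catalog.ls_datasets(prefix="my-project-"):
    print(ds.name)

# Listings for one storage location; the listing prefix is prepended
# automatically and related jobs are not fetched (with_job=False).
for info in catalog.listings(prefix="s3://my-bucket/"):
    print(info)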
datachain/client/azure.py
CHANGED
datachain/client/gcs.py
CHANGED
@@ -74,7 +74,7 @@ class GCSClient(Client):
         try:
             await self._get_pages(prefix, page_queue)
             found = await consumer
-            if not found:
+            if not found and prefix:
                 raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
         finally:
             consumer.cancel()  # In case _get_pages() raised
datachain/client/s3.py
CHANGED
@@ -80,7 +80,7 @@ class ClientS3(Client):
             finally:
                 await page_queue.put(None)
 
-        async def process_pages(page_queue, result_queue):
+        async def process_pages(page_queue, result_queue, prefix):
             found = False
             with tqdm(desc=f"Listing {self.uri}", unit=" objects", leave=False) as pbar:
                 while (res := await page_queue.get()) is not None:
@@ -94,7 +94,7 @@ class ClientS3(Client):
                     if entries:
                         await result_queue.put(entries)
                         pbar.update(len(entries))
-            if not found:
+            if not found and prefix:
                 raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")
 
         try:
@@ -118,7 +118,9 @@ class ClientS3(Client):
             Delimiter="",
         )
         page_queue: asyncio.Queue[list] = asyncio.Queue(2)
-        consumer = asyncio.create_task(process_pages(page_queue, result_queue))
+        consumer = asyncio.create_task(
+            process_pages(page_queue, result_queue, prefix)
+        )
         try:
             await get_pages(it, page_queue)
             await consumer
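Both client changes above relax the same check: FileNotFoundError is now raised only when a non-empty prefix matched nothing, so listing the root of an empty bucket no longer fails. A standalone sketch of the new guard (illustration only, not the actual client code):

def check_found(found: bool, prefix: str) -> None:
    # An empty prefix means "list the whole bucket", so an empty result is fine;
    # an explicit prefix that matched nothing is still an error.
    if not found and prefix:
        raise FileNotFoundError(f"Unable to resolve remote path: {prefix}")

check_found(False, "")  # empty bucket root: returns silently
try:
    check_found(False, "missing/dir")  # unresolved explicit path
except FileNotFoundError as exc:
    print(exc)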
datachain/data_storage/metastore.py
CHANGED
@@ -36,6 +36,7 @@ from datachain.dataset import (
 )
 from datachain.error import (
     DatasetNotFoundError,
+    DatasetVersionNotFoundError,
     TableMissingError,
 )
 from datachain.job import Job
@@ -273,7 +274,6 @@ class AbstractMetastore(ABC, Serializable):
         self,
         job_id: str,
         status: Optional[JobStatus] = None,
-        exit_code: Optional[int] = None,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
         finished_at: Optional[datetime] = None,
@@ -620,22 +620,36 @@ class AbstractDBMetastore(AbstractMetastore):
         self, dataset: DatasetRecord, conn=None, **kwargs
     ) -> DatasetRecord:
         """Updates dataset fields."""
-        values = {}
-        dataset_values = {}
+        values: dict[str, Any] = {}
+        dataset_values: dict[str, Any] = {}
         for field, value in kwargs.items():
-            if field in self._dataset_fields
-
-
+            if field in ("id", "created_at") or field not in self._dataset_fields:
+                continue  # these fields are read-only or not applicable
+
+            if value is None and field in ("name", "status", "sources", "query_script"):
+                raise ValueError(f"Field {field} cannot be None")
+            if field == "name" and not value:
+                raise ValueError("name cannot be empty")
+
+            if field == "attrs":
+                if value is None:
+                    values[field] = None
                 else:
-                values[field] = value
-
-
+                    values[field] = json.dumps(value)
+                    dataset_values[field] = value
+            elif field == "schema":
+                if value is None:
+                    values[field] = None
+                    dataset_values[field] = None
                 else:
-
+                    values[field] = json.dumps(value)
+                    dataset_values[field] = DatasetRecord.parse_schema(value)
+            else:
+                values[field] = value
+                dataset_values[field] = value
 
         if not values:
-            #
-            return dataset
+            return dataset  # nothing to update
 
         d = self._datasets
         self.db.execute(
@@ -651,36 +665,70 @@ class AbstractDBMetastore(AbstractMetastore):
         self, dataset: DatasetRecord, version: str, conn=None, **kwargs
     ) -> DatasetVersion:
         """Updates dataset fields."""
-
-
-        values = {}
-        version_values: dict = {}
+        values: dict[str, Any] = {}
+        version_values: dict[str, Any] = {}
         for field, value in kwargs.items():
-            if
-
-
-
-
-
-
-
-
-
+            if (
+                field in ("id", "created_at")
+                or field not in self._dataset_version_fields
+            ):
+                continue  # these fields are read-only or not applicable
+
+            if value is None and field in (
+                "status",
+                "sources",
+                "query_script",
+                "error_message",
+                "error_stack",
+                "script_output",
+                "uuid",
+            ):
+                raise ValueError(f"Field {field} cannot be None")
+
+            if field == "schema":
+                values[field] = json.dumps(value) if value else None
+                version_values[field] = (
+                    DatasetRecord.parse_schema(value) if value else None
+                )
+            elif field == "feature_schema":
+                if value is None:
+                    values[field] = None
+                else:
+                    values[field] = json.dumps(value)
+                    version_values[field] = value
+            elif field == "preview":
+                if value is None:
+                    values[field] = None
+                elif not isinstance(value, list):
+                    raise ValueError(
+                        f"Field '{field}' must be a list, got {type(value).__name__}"
+                    )
                 else:
-                values[field] = value
-
+                    values[field] = json.dumps(value, cls=JSONSerialize)
+                    version_values["_preview_data"] = value
+            else:
+                values[field] = value
+                version_values[field] = value
 
-        if values:
-
-
-
-
-
-
-            )
-
+        if not values:
+            return dataset.get_version(version)
+
+        dv = self._datasets_versions
+        self.db.execute(
+            self._datasets_versions_update()
+            .where(dv.c.dataset_id == dataset.id, dv.c.version == version)
+            .values(values),
+            conn=conn,
+        )  # type: ignore [attr-defined]
+
+        for v in dataset.versions:
+            if v.version == version:
+                v.update(**version_values)
+                return v
 
-
+        raise DatasetVersionNotFoundError(
+            f"Dataset {dataset.name} does not have version {version}"
+        )
 
     def _parse_dataset(self, rows) -> Optional[DatasetRecord]:
         versions = [self.dataset_class.parse(*r) for r in rows]
@@ -812,7 +860,7 @@ class AbstractDBMetastore(AbstractMetastore):
             update_data["error_message"] = error_message
             update_data["error_stack"] = error_stack
 
-        self.update_dataset(dataset, conn=conn, **update_data)
+        dataset = self.update_dataset(dataset, conn=conn, **update_data)
 
         if version:
             self.update_dataset_version(dataset, version, conn=conn, **update_data)
@@ -1064,7 +1112,6 @@ class AbstractDBMetastore(AbstractMetastore):
         self,
         job_id: str,
         status: Optional[JobStatus] = None,
-        exit_code: Optional[int] = None,
         error_message: Optional[str] = None,
         error_stack: Optional[str] = None,
         finished_at: Optional[datetime] = None,
@@ -1075,8 +1122,6 @@ class AbstractDBMetastore(AbstractMetastore):
         values: dict = {}
         if status is not None:
            values["status"] = status
-        if exit_code is not None:
-            values["exit_code"] = exit_code
         if error_message is not None:
             values["error_message"] = error_message
         if error_stack is not None:
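The rewritten update helpers now split every field into a JSON-serializable value for the database row and a parsed value for the in-memory record, reject None for required fields, and raise DatasetVersionNotFoundError when the requested version does not exist. A condensed, standalone sketch of that per-field dispatch (illustration only, using plain json.dumps rather than the metastore's JSONSerialize encoder):

import json

REQUIRED = {"status", "sources", "query_script", "error_message",
            "error_stack", "script_output", "uuid"}

def split_field(field: str, value):
    """Split a field into a DB value and an in-memory value (simplified)."""
    if value is None and field in REQUIRED:
        raise ValueError(f"Field {field} cannot be None")
    if field in ("schema", "feature_schema"):
        # stored as JSON text in the row, kept structured on the record
        return (json.dumps(value) if value else None), value
    if field == "preview":
        if value is not None and not isinstance(value, list):
            raise ValueError(f"Field '{field}' must be a list, got {type(value).__name__}")
        return (json.dumps(value) if value is not None else None), value
    return value, value

print(split_field("error_message", "boom"))  # ('boom', 'boom')
print(split_field("preview", [{"id": 1}]))   # ('[{"id": 1}]', [{'id': 1}])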
datachain/dataset.py
CHANGED
@@ -93,7 +93,7 @@ class DatasetDependency:
         if self.type == DatasetDependencyType.DATASET:
             return self.name
 
-        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/")
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"))
         assert list_dataset_name
         return list_dataset_name
 
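For reference, parse_listing_uri returns the listing dataset name as the first element of a 3-tuple, which is all the DatasetDependency code above needs. A small sketch using the one-argument form shown in the diff (the storage URI is hypothetical):

from datachain.lib.listing import parse_listing_uri

# Only the first element (the listing dataset name) is used; the other two
# returned values are ignored, as in the DatasetDependency code above.
list_dataset_name, _, _ = parse_listing_uri("s3://my-bucket/images")
print(list_dataset_name)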