datachain-0.18.2-py3-none-any.whl → datachain-0.18.4-py3-none-any.whl
This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +34 -12
- datachain/dataset.py +1 -1
- datachain/lib/listing.py +2 -3
- datachain/query/dataset.py +18 -4
- datachain/remote/studio.py +4 -2
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/METADATA +1 -1
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/RECORD +11 -11
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/WHEEL +0 -0
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/entry_points.txt +0 -0
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
|
@@ -66,6 +66,7 @@ if TYPE_CHECKING:
|
|
|
66
66
|
)
|
|
67
67
|
from datachain.dataset import DatasetListVersion
|
|
68
68
|
from datachain.job import Job
|
|
69
|
+
from datachain.lib.listing_info import ListingInfo
|
|
69
70
|
from datachain.listing import Listing
|
|
70
71
|
|
|
71
72
|
logger = logging.getLogger("datachain")
|
|
@@ -1116,13 +1117,16 @@ class Catalog:
|
|
|
1116
1117
|
return direct_dependencies
|
|
1117
1118
|
|
|
1118
1119
|
def ls_datasets(
|
|
1119
|
-
self,
|
|
1120
|
+
self,
|
|
1121
|
+
prefix: Optional[str] = None,
|
|
1122
|
+
include_listing: bool = False,
|
|
1123
|
+
studio: bool = False,
|
|
1120
1124
|
) -> Iterator[DatasetListRecord]:
|
|
1121
1125
|
from datachain.remote.studio import StudioClient
|
|
1122
1126
|
|
|
1123
1127
|
if studio:
|
|
1124
1128
|
client = StudioClient()
|
|
1125
|
-
response = client.ls_datasets()
|
|
1129
|
+
response = client.ls_datasets(prefix=prefix)
|
|
1126
1130
|
if not response.ok:
|
|
1127
1131
|
raise DataChainError(response.message)
|
|
1128
1132
|
if not response.data:
|
|
@@ -1133,6 +1137,8 @@ class Catalog:
|
|
|
1133
1137
|
for d in response.data
|
|
1134
1138
|
if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
|
|
1135
1139
|
)
|
|
1140
|
+
elif prefix:
|
|
1141
|
+
datasets = self.metastore.list_datasets_by_prefix(prefix)
|
|
1136
1142
|
else:
|
|
1137
1143
|
datasets = self.metastore.list_datasets()
|
|
1138
1144
|
|
|
@@ -1142,39 +1148,55 @@ class Catalog:
|
|
|
1142
1148
|
|
|
1143
1149
|
def list_datasets_versions(
|
|
1144
1150
|
self,
|
|
1151
|
+
prefix: Optional[str] = None,
|
|
1145
1152
|
include_listing: bool = False,
|
|
1153
|
+
with_job: bool = True,
|
|
1146
1154
|
studio: bool = False,
|
|
1147
1155
|
) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
|
|
1148
1156
|
"""Iterate over all dataset versions with related jobs."""
|
|
1149
1157
|
datasets = list(
|
|
1150
|
-
self.ls_datasets(
|
|
1158
|
+
self.ls_datasets(
|
|
1159
|
+
prefix=prefix, include_listing=include_listing, studio=studio
|
|
1160
|
+
)
|
|
1151
1161
|
)
|
|
1152
1162
|
|
|
1153
1163
|
# preselect dataset versions jobs from db to avoid multiple queries
|
|
1154
|
-
jobs_ids: set[str] = {
|
|
1155
|
-
v.job_id for ds in datasets for v in ds.versions if v.job_id
|
|
1156
|
-
}
|
|
1157
1164
|
jobs: dict[str, Job] = {}
|
|
1158
|
-
if
|
|
1159
|
-
|
|
1165
|
+
if with_job:
|
|
1166
|
+
jobs_ids: set[str] = {
|
|
1167
|
+
v.job_id for ds in datasets for v in ds.versions if v.job_id
|
|
1168
|
+
}
|
|
1169
|
+
if jobs_ids:
|
|
1170
|
+
jobs = {
|
|
1171
|
+
j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))
|
|
1172
|
+
}
|
|
1160
1173
|
|
|
1161
1174
|
for d in datasets:
|
|
1162
1175
|
yield from (
|
|
1163
|
-
(d, v, jobs.get(str(v.job_id)) if v.job_id else None)
|
|
1176
|
+
(d, v, jobs.get(str(v.job_id)) if with_job and v.job_id else None)
|
|
1164
1177
|
for v in d.versions
|
|
1165
1178
|
)
|
|
1166
1179
|
|
|
1167
|
-
def listings(self):
|
|
1180
|
+
def listings(self, prefix: Optional[str] = None) -> list["ListingInfo"]:
|
|
1168
1181
|
"""
|
|
1169
1182
|
Returns list of ListingInfo objects which are representing specific
|
|
1170
1183
|
storage listing datasets
|
|
1171
1184
|
"""
|
|
1172
|
-
from datachain.lib.listing import is_listing_dataset
|
|
1185
|
+
from datachain.lib.listing import LISTING_PREFIX, is_listing_dataset
|
|
1173
1186
|
from datachain.lib.listing_info import ListingInfo
|
|
1174
1187
|
|
|
1188
|
+
if prefix and not prefix.startswith(LISTING_PREFIX):
|
|
1189
|
+
prefix = LISTING_PREFIX + prefix
|
|
1190
|
+
|
|
1191
|
+
listing_datasets_versions = self.list_datasets_versions(
|
|
1192
|
+
prefix=prefix,
|
|
1193
|
+
include_listing=True,
|
|
1194
|
+
with_job=False,
|
|
1195
|
+
)
|
|
1196
|
+
|
|
1175
1197
|
return [
|
|
1176
1198
|
ListingInfo.from_models(d, v, j)
|
|
1177
|
-
for d, v, j in
|
|
1199
|
+
for d, v, j in listing_datasets_versions
|
|
1178
1200
|
if is_listing_dataset(d.name)
|
|
1179
1201
|
]
|
|
1180
1202
|
|
datachain/dataset.py
CHANGED
|
@@ -93,7 +93,7 @@ class DatasetDependency:
|
|
|
93
93
|
if self.type == DatasetDependencyType.DATASET:
|
|
94
94
|
return self.name
|
|
95
95
|
|
|
96
|
-
list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/")
|
|
96
|
+
list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"))
|
|
97
97
|
assert list_dataset_name
|
|
98
98
|
return list_dataset_name
|
|
99
99
|
|
datachain/lib/listing.py
CHANGED
|
@@ -107,11 +107,10 @@ def ls(
|
|
|
107
107
|
return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
def parse_listing_uri(uri: str
|
|
110
|
+
def parse_listing_uri(uri: str) -> tuple[str, str, str]:
|
|
111
111
|
"""
|
|
112
112
|
Parsing uri and returns listing dataset name, listing uri and listing path
|
|
113
113
|
"""
|
|
114
|
-
client_config = client_config or {}
|
|
115
114
|
storage_uri, path = Client.parse_url(uri)
|
|
116
115
|
if uses_glob(path):
|
|
117
116
|
lst_uri_path = posixpath.dirname(path)
|
|
@@ -175,7 +174,7 @@ def get_listing(
|
|
|
175
174
|
_, path = Client.parse_url(uri)
|
|
176
175
|
return None, uri, path, False
|
|
177
176
|
|
|
178
|
-
ds_name, list_uri, list_path = parse_listing_uri(uri
|
|
177
|
+
ds_name, list_uri, list_path = parse_listing_uri(uri)
|
|
179
178
|
listing = None
|
|
180
179
|
listings = [
|
|
181
180
|
ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
|
datachain/query/dataset.py
CHANGED
|
@@ -1675,13 +1675,27 @@ class DatasetQuery:
|
|
|
1675
1675
|
return query
|
|
1676
1676
|
|
|
1677
1677
|
def _add_dependencies(self, dataset: "DatasetRecord", version: str):
|
|
1678
|
-
|
|
1679
|
-
|
|
1678
|
+
dependencies: set[DatasetDependencyType] = set()
|
|
1679
|
+
for dep_name, dep_version in self.dependencies:
|
|
1680
|
+
if Session.is_temp_dataset(dep_name):
|
|
1681
|
+
# temp dataset are created for optimization and they will be removed
|
|
1682
|
+
# afterwards. Therefore, we should not put them as dependencies, but
|
|
1683
|
+
# their own direct dependencies
|
|
1684
|
+
for dep in self.catalog.get_dataset_dependencies(
|
|
1685
|
+
dep_name, dep_version, indirect=False
|
|
1686
|
+
):
|
|
1687
|
+
if dep:
|
|
1688
|
+
dependencies.add((dep.name, dep.version))
|
|
1689
|
+
else:
|
|
1690
|
+
dependencies.add((dep_name, dep_version))
|
|
1691
|
+
|
|
1692
|
+
for dep_name, dep_version in dependencies:
|
|
1693
|
+
# ds_dependency_name, ds_dependency_version = dependency
|
|
1680
1694
|
self.catalog.metastore.add_dataset_dependency(
|
|
1681
1695
|
dataset.name,
|
|
1682
1696
|
version,
|
|
1683
|
-
|
|
1684
|
-
|
|
1697
|
+
dep_name,
|
|
1698
|
+
dep_version,
|
|
1685
1699
|
)
|
|
1686
1700
|
|
|
1687
1701
|
def exec(self) -> "Self":
|
datachain/remote/studio.py
CHANGED
|
@@ -282,8 +282,10 @@ class StudioClient:
|
|
|
282
282
|
response = self._send_request_msgpack("datachain/ls", {"source": path})
|
|
283
283
|
yield path, response
|
|
284
284
|
|
|
285
|
-
def ls_datasets(self) -> Response[LsData]:
|
|
286
|
-
return self._send_request(
|
|
285
|
+
def ls_datasets(self, prefix: Optional[str] = None) -> Response[LsData]:
|
|
286
|
+
return self._send_request(
|
|
287
|
+
"datachain/datasets", {"prefix": prefix}, method="GET"
|
|
288
|
+
)
|
|
287
289
|
|
|
288
290
|
def edit_dataset(
|
|
289
291
|
self,
|
|
{datachain-0.18.2.dist-info → datachain-0.18.4.dist-info}/RECORD
CHANGED
|
@@ -3,7 +3,7 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
|
3
3
|
datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
|
|
4
4
|
datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
|
|
5
5
|
datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
|
|
6
|
-
datachain/dataset.py,sha256=
|
|
6
|
+
datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
|
|
7
7
|
datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
|
|
8
8
|
datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
|
|
9
9
|
datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
|
|
@@ -19,7 +19,7 @@ datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
|
|
|
19
19
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
20
20
|
datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
|
|
21
21
|
datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
|
|
22
|
-
datachain/catalog/catalog.py,sha256=
|
|
22
|
+
datachain/catalog/catalog.py,sha256=vC6CkPKSmF5heXx7JEHbm94BzQOqKWLg-T0CBv3a24A,59462
|
|
23
23
|
datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
|
|
24
24
|
datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
|
|
25
25
|
datachain/cli/__init__.py,sha256=eKCyqT05OMESHXCC93iQdqzusBdk1ptqZbBeaEghkgo,8344
|
|
@@ -75,7 +75,7 @@ datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo
|
|
|
75
75
|
datachain/lib/file.py,sha256=0oFm1MWU7AatXplxRj-6Xbjjb6A_AvM_awwk9mYb0hc,30466
|
|
76
76
|
datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
|
|
77
77
|
datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
|
|
78
|
-
datachain/lib/listing.py,sha256=
|
|
78
|
+
datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
|
|
79
79
|
datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
|
|
80
80
|
datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A,6349
|
|
81
81
|
datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
|
|
@@ -121,7 +121,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
|
|
|
121
121
|
datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
|
|
122
122
|
datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
|
|
123
123
|
datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
|
|
124
|
-
datachain/query/dataset.py,sha256=
|
|
124
|
+
datachain/query/dataset.py,sha256=3c3MAiIl7ZnCii_0dZA-Om73ornNMSKkna32JX3H05E,60587
|
|
125
125
|
datachain/query/dispatch.py,sha256=15M3zlTUFKM6D2ijITX4o5QxCkRe2klkODsIDi3aQOg,15544
|
|
126
126
|
datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
|
|
127
127
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -131,7 +131,7 @@ datachain/query/session.py,sha256=6_ydvPasurmc5tR11dzFj51DpUAo4NxXP9p4ltoTauc,67
|
|
|
131
131
|
datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
|
|
132
132
|
datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
|
|
133
133
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
|
-
datachain/remote/studio.py,sha256=
|
|
134
|
+
datachain/remote/studio.py,sha256=kUu4TBTKSky1qwdql5DtXNA045qXeoPn4o8G67eZJI4,13666
|
|
135
135
|
datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
|
|
136
136
|
datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
|
|
137
137
|
datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
|
|
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
|
|
|
153
153
|
datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
|
|
154
154
|
datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
|
|
155
155
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
156
|
-
datachain-0.18.
|
|
157
|
-
datachain-0.18.
|
|
158
|
-
datachain-0.18.
|
|
159
|
-
datachain-0.18.
|
|
160
|
-
datachain-0.18.
|
|
161
|
-
datachain-0.18.
|
|
156
|
+
datachain-0.18.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
157
|
+
datachain-0.18.4.dist-info/METADATA,sha256=8v8KDSKKk69QhkpKbfXqNg6cW9S5FGBaGVEVVkHXl5g,11331
|
|
158
|
+
datachain-0.18.4.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
|
|
159
|
+
datachain-0.18.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
160
|
+
datachain-0.18.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
161
|
+
datachain-0.18.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|