datachain 0.18.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -66,6 +66,7 @@ if TYPE_CHECKING:
66
66
  )
67
67
  from datachain.dataset import DatasetListVersion
68
68
  from datachain.job import Job
69
+ from datachain.lib.listing_info import ListingInfo
69
70
  from datachain.listing import Listing
70
71
 
71
72
  logger = logging.getLogger("datachain")
@@ -1116,13 +1117,16 @@ class Catalog:
1116
1117
  return direct_dependencies
1117
1118
 
1118
1119
  def ls_datasets(
1119
- self, include_listing: bool = False, studio: bool = False
1120
+ self,
1121
+ prefix: Optional[str] = None,
1122
+ include_listing: bool = False,
1123
+ studio: bool = False,
1120
1124
  ) -> Iterator[DatasetListRecord]:
1121
1125
  from datachain.remote.studio import StudioClient
1122
1126
 
1123
1127
  if studio:
1124
1128
  client = StudioClient()
1125
- response = client.ls_datasets()
1129
+ response = client.ls_datasets(prefix=prefix)
1126
1130
  if not response.ok:
1127
1131
  raise DataChainError(response.message)
1128
1132
  if not response.data:
@@ -1133,6 +1137,8 @@ class Catalog:
1133
1137
  for d in response.data
1134
1138
  if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
1135
1139
  )
1140
+ elif prefix:
1141
+ datasets = self.metastore.list_datasets_by_prefix(prefix)
1136
1142
  else:
1137
1143
  datasets = self.metastore.list_datasets()
1138
1144
 
@@ -1142,39 +1148,55 @@ class Catalog:
1142
1148
 
1143
1149
  def list_datasets_versions(
1144
1150
  self,
1151
+ prefix: Optional[str] = None,
1145
1152
  include_listing: bool = False,
1153
+ with_job: bool = True,
1146
1154
  studio: bool = False,
1147
1155
  ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
1148
1156
  """Iterate over all dataset versions with related jobs."""
1149
1157
  datasets = list(
1150
- self.ls_datasets(include_listing=include_listing, studio=studio)
1158
+ self.ls_datasets(
1159
+ prefix=prefix, include_listing=include_listing, studio=studio
1160
+ )
1151
1161
  )
1152
1162
 
1153
1163
  # preselect dataset versions jobs from db to avoid multiple queries
1154
- jobs_ids: set[str] = {
1155
- v.job_id for ds in datasets for v in ds.versions if v.job_id
1156
- }
1157
1164
  jobs: dict[str, Job] = {}
1158
- if jobs_ids:
1159
- jobs = {j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))}
1165
+ if with_job:
1166
+ jobs_ids: set[str] = {
1167
+ v.job_id for ds in datasets for v in ds.versions if v.job_id
1168
+ }
1169
+ if jobs_ids:
1170
+ jobs = {
1171
+ j.id: j for j in self.metastore.list_jobs_by_ids(list(jobs_ids))
1172
+ }
1160
1173
 
1161
1174
  for d in datasets:
1162
1175
  yield from (
1163
- (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
1176
+ (d, v, jobs.get(str(v.job_id)) if with_job and v.job_id else None)
1164
1177
  for v in d.versions
1165
1178
  )
1166
1179
 
1167
- def listings(self):
1180
+ def listings(self, prefix: Optional[str] = None) -> list["ListingInfo"]:
1168
1181
  """
1169
1182
  Returns list of ListingInfo objects which are representing specific
1170
1183
  storage listing datasets
1171
1184
  """
1172
- from datachain.lib.listing import is_listing_dataset
1185
+ from datachain.lib.listing import LISTING_PREFIX, is_listing_dataset
1173
1186
  from datachain.lib.listing_info import ListingInfo
1174
1187
 
1188
+ if prefix and not prefix.startswith(LISTING_PREFIX):
1189
+ prefix = LISTING_PREFIX + prefix
1190
+
1191
+ listing_datasets_versions = self.list_datasets_versions(
1192
+ prefix=prefix,
1193
+ include_listing=True,
1194
+ with_job=False,
1195
+ )
1196
+
1175
1197
  return [
1176
1198
  ListingInfo.from_models(d, v, j)
1177
- for d, v, j in self.list_datasets_versions(include_listing=True)
1199
+ for d, v, j in listing_datasets_versions
1178
1200
  if is_listing_dataset(d.name)
1179
1201
  ]
1180
1202
 
datachain/dataset.py CHANGED
@@ -93,7 +93,7 @@ class DatasetDependency:
93
93
  if self.type == DatasetDependencyType.DATASET:
94
94
  return self.name
95
95
 
96
- list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), {})
96
+ list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"))
97
97
  assert list_dataset_name
98
98
  return list_dataset_name
99
99
 
datachain/lib/listing.py CHANGED
@@ -107,11 +107,10 @@ def ls(
107
107
  return dc.filter(pathfunc.parent(_file_c("path")) == path.lstrip("/").rstrip("/*"))
108
108
 
109
109
 
110
- def parse_listing_uri(uri: str, client_config) -> tuple[str, str, str]:
110
+ def parse_listing_uri(uri: str) -> tuple[str, str, str]:
111
111
  """
112
112
  Parsing uri and returns listing dataset name, listing uri and listing path
113
113
  """
114
- client_config = client_config or {}
115
114
  storage_uri, path = Client.parse_url(uri)
116
115
  if uses_glob(path):
117
116
  lst_uri_path = posixpath.dirname(path)
@@ -175,7 +174,7 @@ def get_listing(
175
174
  _, path = Client.parse_url(uri)
176
175
  return None, uri, path, False
177
176
 
178
- ds_name, list_uri, list_path = parse_listing_uri(uri, client_config)
177
+ ds_name, list_uri, list_path = parse_listing_uri(uri)
179
178
  listing = None
180
179
  listings = [
181
180
  ls for ls in catalog.listings() if not ls.is_expired and ls.contains(ds_name)
@@ -1675,13 +1675,27 @@ class DatasetQuery:
1675
1675
  return query
1676
1676
 
1677
1677
  def _add_dependencies(self, dataset: "DatasetRecord", version: str):
1678
- for dependency in self.dependencies:
1679
- ds_dependency_name, ds_dependency_version = dependency
1678
+ dependencies: set[DatasetDependencyType] = set()
1679
+ for dep_name, dep_version in self.dependencies:
1680
+ if Session.is_temp_dataset(dep_name):
1681
+ # temp dataset are created for optimization and they will be removed
1682
+ # afterwards. Therefore, we should not put them as dependencies, but
1683
+ # their own direct dependencies
1684
+ for dep in self.catalog.get_dataset_dependencies(
1685
+ dep_name, dep_version, indirect=False
1686
+ ):
1687
+ if dep:
1688
+ dependencies.add((dep.name, dep.version))
1689
+ else:
1690
+ dependencies.add((dep_name, dep_version))
1691
+
1692
+ for dep_name, dep_version in dependencies:
1693
+ # ds_dependency_name, ds_dependency_version = dependency
1680
1694
  self.catalog.metastore.add_dataset_dependency(
1681
1695
  dataset.name,
1682
1696
  version,
1683
- ds_dependency_name,
1684
- ds_dependency_version,
1697
+ dep_name,
1698
+ dep_version,
1685
1699
  )
1686
1700
 
1687
1701
  def exec(self) -> "Self":
@@ -282,8 +282,10 @@ class StudioClient:
282
282
  response = self._send_request_msgpack("datachain/ls", {"source": path})
283
283
  yield path, response
284
284
 
285
- def ls_datasets(self) -> Response[LsData]:
286
- return self._send_request("datachain/datasets", {}, method="GET")
285
+ def ls_datasets(self, prefix: Optional[str] = None) -> Response[LsData]:
286
+ return self._send_request(
287
+ "datachain/datasets", {"prefix": prefix}, method="GET"
288
+ )
287
289
 
288
290
  def edit_dataset(
289
291
  self,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.2
3
+ Version: 0.18.4
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -3,7 +3,7 @@ datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
4
4
  datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
5
5
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
6
- datachain/dataset.py,sha256=F0Yk9SmyAf0RNSAEWGjOyZ4nxgMNi538FaQaLPe7bJk,20531
6
+ datachain/dataset.py,sha256=XUZ-kSBL1y6juFqlSWXXbattGS1E53lXpyhc0Ip1_AA,20527
7
7
  datachain/delta.py,sha256=q-ritPMxgsTh53qJYd2N1TqZ3Inxc7GJ9JED9rE-Z1M,3994
8
8
  datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
9
9
  datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
@@ -19,7 +19,7 @@ datachain/studio.py,sha256=1J2ANFVVA1ysPxBuLibQSnSXt0U9Vfgz9ZNGikYtWdk,11038
19
19
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
20
20
  datachain/utils.py,sha256=DNqOi-Ydb7InyWvD9m7_yailxz6-YGpZzh00biQaHNo,15305
21
21
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
22
- datachain/catalog/catalog.py,sha256=aB8IGLuvWjZVROOmOKksA0gKiLQyur9Z4GCRdjgfdXo,58725
22
+ datachain/catalog/catalog.py,sha256=vC6CkPKSmF5heXx7JEHbm94BzQOqKWLg-T0CBv3a24A,59462
23
23
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
24
24
  datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
25
25
  datachain/cli/__init__.py,sha256=eKCyqT05OMESHXCC93iQdqzusBdk1ptqZbBeaEghkgo,8344
@@ -75,7 +75,7 @@ datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo
75
75
  datachain/lib/file.py,sha256=0oFm1MWU7AatXplxRj-6Xbjjb6A_AvM_awwk9mYb0hc,30466
76
76
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
77
77
  datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
78
- datachain/lib/listing.py,sha256=lFG1Ms6jwm_uqlOcsBUpkmyeSO9TZdtNd820PEpAHP4,7077
78
+ datachain/lib/listing.py,sha256=5_GoATtIwCtd1JMqlorPB_vQDxndOQZpiWjNOG3NMw4,7007
79
79
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
80
80
  datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A,6349
81
81
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
@@ -121,7 +121,7 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
121
121
  datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
122
122
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
123
123
  datachain/query/batch.py,sha256=-goxLpE0EUvaDHu66rstj53UnfHpYfBUGux8GSpJ93k,4306
124
- datachain/query/dataset.py,sha256=ac4mameklmZ-mnL3ZSzv5n8teaPnoXT8aWCdGlgkZE8,59904
124
+ datachain/query/dataset.py,sha256=3c3MAiIl7ZnCii_0dZA-Om73ornNMSKkna32JX3H05E,60587
125
125
  datachain/query/dispatch.py,sha256=15M3zlTUFKM6D2ijITX4o5QxCkRe2klkODsIDi3aQOg,15544
126
126
  datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,858
127
127
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -131,7 +131,7 @@ datachain/query/session.py,sha256=6_ydvPasurmc5tR11dzFj51DpUAo4NxXP9p4ltoTauc,67
131
131
  datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
132
132
  datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
133
133
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
134
- datachain/remote/studio.py,sha256=pkgrhG7Bc5Z8RykgTg0S_XXiI8CpRnEbyXrOb5osgAM,13598
134
+ datachain/remote/studio.py,sha256=kUu4TBTKSky1qwdql5DtXNA045qXeoPn4o8G67eZJI4,13666
135
135
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
136
136
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
137
137
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
153
153
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
154
154
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
155
155
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
156
- datachain-0.18.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
- datachain-0.18.2.dist-info/METADATA,sha256=85UZ7jLkbhT_UI7oaFBD5m1NZ1dL_qn4VKfXCQxTLiY,11331
158
- datachain-0.18.2.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
159
- datachain-0.18.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
- datachain-0.18.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
- datachain-0.18.2.dist-info/RECORD,,
156
+ datachain-0.18.4.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
+ datachain-0.18.4.dist-info/METADATA,sha256=8v8KDSKKk69QhkpKbfXqNg6cW9S5FGBaGVEVVkHXl5g,11331
158
+ datachain-0.18.4.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
159
+ datachain-0.18.4.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
+ datachain-0.18.4.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
+ datachain-0.18.4.dist-info/RECORD,,