datachain 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

@@ -156,8 +156,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]
 
 
 class DatasetRowsFetcher(NodesThreadPool):
@@ -1020,20 +1018,6 @@ class Catalog:
 
         return node_groups
 
-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1281,6 @@ class Catalog:
 
         return self.get_dataset(name)
 
-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1393,18 @@ class Catalog:
 
         return direct_dependencies
 
-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d
 
     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
        """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))
 
         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
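
The two hunks above add an include_listing flag so callers can opt in to seeing the datasets that back bucket listings. A minimal sketch of the new behavior, assuming the get_catalog helper for obtaining a Catalog (import path assumed):

from datachain.catalog import get_catalog  # assumed helper; adjust to however you obtain a Catalog

catalog = get_catalog()

# Default: bucket-listing datasets stay hidden, as before.
for ds in catalog.ls_datasets():
    print(ds.name)

# New: pass include_listing=True to also yield the listing datasets.
for ds in catalog.ls_datasets(include_listing=True):
    print(ds.name, ds.is_bucket_listing)
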
@@ -1560,17 +1532,8 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
-        file_schemas = {}
-        # TODO: To remove after we properly fix deserialization
-        for signal, type_name in version.feature_schema.items():
-            from datachain.lib.model_store import ModelStore
-
-            type_name_parsed, v = ModelStore.parse_name_version(type_name)
-            fr = ModelStore.get(type_name_parsed, v)
-            if fr and issubclass(fr, File):
-                file_schemas[signal] = type_name
 
-        schema = SignalSchema.deserialize(file_schemas)
+        schema = SignalSchema.deserialize(version.feature_schema)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -1641,15 +1604,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)
 
-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1883,10 +1837,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1914,9 +1864,8 @@ class Catalog:
                 C.size > 1000
             )
         """
-        from datachain.query.dataset import ExecutionResult
 
-        feature_file = tempfile.NamedTemporaryFile(
+        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
             dir=os.getcwd(), suffix=".py", delete=False
         )
         _, feature_module = os.path.split(feature_file.name)
@@ -1931,11 +1880,7 @@ class Catalog:
                 feature_module,
                 output_hook,
                 params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
                 save,
-                save_as,
                 job_id,
             )
         finally:
@@ -1964,25 +1909,18 @@ class Catalog:
         )
 
         try:
-            response = json.loads(response_text)
+            result = json.loads(response_text)
         except ValueError:
-            response = {}
-        exec_result = ExecutionResult(**response)
+            result = None
 
         dataset: Optional[DatasetRecord] = None
         version: Optional[int] = None
-        if save or save_as:
+        if save:
             dataset, version = self.save_result(
-                query_script, exec_result, output, version, job_id
+                query_script, result, output, version, job_id
             )
 
-        return QueryResult(
-            dataset=dataset,
-            version=version,
-            output=output,
-            preview=exec_result.preview,
-            metrics=exec_result.metrics,
-        )
+        return QueryResult(dataset=dataset, version=version, output=output)
 
     def run_query(
         self,
@@ -1994,11 +1932,7 @@ class Catalog:
         feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
     ) -> tuple[list[str], subprocess.Popen, str]:
         try:
@@ -2013,10 +1947,6 @@ class Catalog:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
         r, w = os.pipe()
         if os.name == "nt":
             import msvcrt
@@ -2039,15 +1969,7 @@ class Catalog:
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
                 "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
@@ -2077,12 +1999,12 @@ class Catalog:
         return lines, proc, response_text
 
     def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
+        if not exec_result:
            raise QueryScriptDatasetNotFound(
                "No dataset found after running Query script",
                output=output,
            )
-        name, version = exec_result.dataset
+        name, version = exec_result
         # finding returning dataset
         try:
             dataset = self.get_dataset(name)
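
Taken together, these hunks remove the preview and save_as plumbing from Catalog.query, and QueryResult now carries only dataset, version and output. A rough sketch of the resulting call, assuming catalog is a Catalog instance and script_content holds a query script:

result = catalog.query(script_content, save=True)  # `catalog` and `script_content` assumed

if result.dataset is not None:
    print(result.dataset.name, result.version)
print(result.output)
# result.preview and result.metrics no longer exist in this version.
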
datachain/cli.py CHANGED
@@ -14,6 +14,7 @@ import shtab
 
 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir
 
 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")
 
 
-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-        for entry in ls_indexed_storages(catalog, long=long):
-            print(format_ls_entry(entry))
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]
 
 
 def format_ls_entry(entry: str) -> str:
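
With no sources given, `datachain ls` now enumerates storages through DataChain.listings() instead of the removed storage APIs. The same pattern should work outside the CLI; a small sketch mirroring the code above:

from datachain.lib.dc import DataChain

for ls in DataChain.listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")  # one line per indexed storage listing
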
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records
 
     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@ def query(
     )
 
     try:
-        result = catalog.query(
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@ def query(
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)
 
 
 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3
 
@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient
 
         raise NotImplementedError(f"Unsupported protocol: {protocol}")
 
datachain/client/hf.py ADDED
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
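
With the client registered in Client.get_implementation above, hf:// URIs resolve to this filesystem. A sketch of reading from the Hugging Face Hub, assuming DataChain.from_storage accepts the URI once the protocol is registered; the repository path is a placeholder:

from datachain.lib.dc import DataChain

# HfClient.create_fs picks up HF_TOKEN from the environment when it is set,
# so private repositories need no extra configuration.
chain = DataChain.from_storage("hf://datasets/some-org/some-dataset")  # placeholder path
for file in chain.collect("file"):
    print(file.path, file.size)
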
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
         This method should be called when index operation is finished.
         """
 
-    @abstractmethod
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-
     @abstractmethod
     def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
         """Updates last inserted datetime in bucket with current time."""
 
-    @abstractmethod
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-
     @abstractmethod
     def get_storage(self, uri: StorageURI) -> Storage:
         """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
         E.g. if s3 is used as storage this would be s3 bucket data.
         """
 
-    @abstractmethod
-    def list_storages(self) -> list[Storage]:
-        """Returns all storages."""
-
     @abstractmethod
     def mark_storage_pending(self, storage: Storage) -> Storage:
         """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
             self.add_dataset_dependency(
                 source_dataset_name,
                 source_dataset_version,
-                dependency.name,
+                dependency.dataset_name,
                 int(dependency.version),
             )
         else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
             self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
         )
 
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-        s = self._storages
-        yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
-
     def get_storage(self, uri: StorageURI, conn=None) -> Storage:
         """
         Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):
 
         return self.storage_class._make(result)
 
-    def list_storages(self) -> list[Storage]:
-        result = self.db.execute(self._storages_select())
-        if not result:
-            return []
-
-        return [self.storage_class._make(r) for r in result]
-
     def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
         # Update status to pending and dates
         updates = {
@@ -1503,7 +1476,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return self._jobs.update().where(*where)
 
     def _parse_job(self, rows) -> Job:
-        return Job.parse(*rows)
+        return self.job_class.parse(*rows)
 
     def _parse_jobs(self, rows) -> Iterator["Job"]:
         for _, g in groupby(rows, lambda r: r[0]):
@@ -143,7 +143,9 @@ class SQLiteDatabaseEngine(DatabaseEngine):
            db.execute("PRAGMA synchronous = NORMAL")
            db.execute("PRAGMA case_sensitive_like = ON")
            if os.environ.get("DEBUG_SHOW_SQL_QUERIES"):
-                db.set_trace_callback(print)
+                import sys
+
+                db.set_trace_callback(sys.stderr.write)
 
            load_usearch_extension(db)
 
@@ -515,17 +517,6 @@ class SQLiteMetastore(AbstractDBMetastore):
     def _datasets_dependencies_insert(self) -> "Insert":
         return sqlite.insert(self._datasets_dependencies)
 
-    #
-    # Storages
-    #
-
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-        self.db.execute(self._storages_delete().where(self._storages.c.uri == uri))
-
     #
     # Dataset dependencies
     #
@@ -218,35 +218,26 @@ class AbstractWarehouse(ABC, Serializable):
         results = None
         offset = 0
         num_yielded = 0
-        try:
-            while True:
-                if limit is not None:
-                    limit -= num_yielded
-                    if limit == 0:
-                        break
-                    if limit < page_size:
-                        paginated_query = paginated_query.limit(None).limit(limit)
-
-                results = self.dataset_rows_select(paginated_query.offset(offset))
-
-                processed = False
-                for row in results:
-                    processed = True
-                    yield row
-                    num_yielded += 1
-
-                if not processed:
-                    break  # no more results
-                offset += page_size
-        finally:
-            # https://www2.sqlite.org/cvstrac/wiki?p=DatabaseIsLocked (SELECT not
-            # finalized or reset) to prevent database table is locked error when an
-            # exception is raised in the middle of processing the results (e.g.
-            # https://github.com/iterative/dvcx/issues/924). Connections close
-            # apparently is not enough in some cases, at least on sqlite
-            # https://www.sqlite.org/c3ref/close.html
-            if results and hasattr(results, "close"):
-                results.close()
+
+        while True:
+            if limit is not None:
+                limit -= num_yielded
+                if limit == 0:
+                    break
+                if limit < page_size:
+                    paginated_query = paginated_query.limit(None).limit(limit)
+
+            results = self.dataset_rows_select(paginated_query.offset(offset))
+
+            processed = False
+            for row in results:
+                processed = True
+                yield row
+                num_yielded += 1
+
+            if not processed:
+                break  # no more results
+            offset += page_size
 
     #
     # Table Name Internal Functions
datachain/dataset.py CHANGED
@@ -11,8 +11,6 @@ from typing import (
 )
 from urllib.parse import urlparse
 
-from dateutil.parser import isoparse
-
 from datachain.client import Client
 from datachain.sql.types import NAME_TYPES_MAPPING, SQLType
 
@@ -25,6 +23,7 @@ DD = TypeVar("DD", bound="DatasetDependency")
 
 DATASET_PREFIX = "ds://"
 QUERY_DATASET_PREFIX = "ds_query_"
+LISTING_PREFIX = "lst__"
 
 
 def parse_dataset_uri(uri: str) -> tuple[str, Optional[int]]:
@@ -72,11 +71,22 @@ class DatasetDependencyType:
 class DatasetDependency:
     id: int
     type: str
-    name: str  # when the type is STORAGE, this is actually StorageURI
-    version: str  # string until we'll have proper bucket listing versions
+    name: str
+    version: str  # TODO change to int
     created_at: datetime
     dependencies: list[Optional["DatasetDependency"]]
 
+    @property
+    def dataset_name(self) -> str:
+        """Returns clean dependency dataset name"""
+        from datachain.lib.listing import parse_listing_uri
+
+        if self.type == DatasetDependencyType.DATASET:
+            return self.name
+
+        list_dataset_name, _, _ = parse_listing_uri(self.name.strip("/"), None, {})
+        return list_dataset_name
+
     @classmethod
     def parse(
         cls: builtins.type[DD],
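
A short sketch of the new dataset_name property, assuming DatasetDependency is a plain dataclass as the field list suggests (values are illustrative):

from datetime import datetime, timezone

from datachain.dataset import DatasetDependency, DatasetDependencyType

dep = DatasetDependency(
    id=1,
    type=DatasetDependencyType.DATASET,
    name="my_dataset",
    version="3",
    created_at=datetime.now(timezone.utc),
    dependencies=[],
)

# DATASET dependencies pass the name through unchanged; STORAGE dependencies
# map the stored URI back to the listing dataset name via parse_listing_uri.
assert dep.dataset_name == "my_dataset"
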
@@ -91,33 +101,31 @@ class DatasetDependency:
         dataset_version_created_at: Optional[datetime],
         bucket_uri: Optional["StorageURI"],
     ) -> Optional["DatasetDependency"]:
-        if dataset_id:
-            assert dataset_name is not None
-            return cls(
-                id,
-                DatasetDependencyType.DATASET,
-                dataset_name,
-                (
-                    str(dataset_version)  # type: ignore[arg-type]
-                    if dataset_version
-                    else None
-                ),
-                dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
-                [],
-            )
-        if bucket_uri:
-            return cls(
-                id,
-                DatasetDependencyType.STORAGE,
-                bucket_uri,
-                bucket_version,  # type: ignore[arg-type]
-                isoparse(bucket_version),  # type: ignore[arg-type]
-                [],
-            )
-        # dependency has been removed
-        # TODO we should introduce flags for removed datasets, instead of
-        # removing them from tables so that we can still have references
-        return None
+        from datachain.lib.listing import is_listing_dataset, listing_uri_from_name
+
+        if not dataset_id:
+            return None
+
+        assert dataset_name is not None
+        dependency_type = DatasetDependencyType.DATASET
+        dependency_name = dataset_name
+
+        if is_listing_dataset(dataset_name):
+            dependency_type = DatasetDependencyType.STORAGE  # type: ignore[arg-type]
+            dependency_name = listing_uri_from_name(dataset_name)
+
+        return cls(
+            id,
+            dependency_type,
+            dependency_name,
+            (
+                str(dataset_version)  # type: ignore[arg-type]
+                if dataset_version
+                else None
+            ),
+            dataset_version_created_at or dataset_created_at,  # type: ignore[arg-type]
+            [],
+        )
 
     @property
     def is_dataset(self) -> bool:
@@ -443,7 +451,11 @@ class DatasetRecord:
         For bucket listing we implicitly create underlying dataset to hold data. This
         method is checking if this is one of those datasets.
         """
-        return Client.is_data_source_uri(self.name)
+        # TODO refactor and maybe remove method in
+        # https://github.com/iterative/datachain/issues/318
+        return Client.is_data_source_uri(self.name) or self.name.startswith(
+            LISTING_PREFIX
+        )
 
     @property
     def versions_values(self) -> list[int]:
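
The check above reuses the LISTING_PREFIX constant introduced earlier in this file. A small standalone sketch of the same logic, with illustrative names:

from datachain.client import Client
from datachain.dataset import LISTING_PREFIX

def looks_like_listing(name: str) -> bool:
    # Mirrors the expression DatasetRecord.is_bucket_listing now evaluates.
    return Client.is_data_source_uri(name) or name.startswith(LISTING_PREFIX)

print(looks_like_listing("s3://my-bucket"))        # old URI-style listing dataset name
print(looks_like_listing("lst__s3://my-bucket/"))  # new prefixed listing dataset name
print(looks_like_listing("my_dataset"))            # regular dataset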