datachain 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -9,7 +9,6 @@ import os.path
 import posixpath
 import subprocess
 import sys
-import tempfile
 import time
 import traceback
 from collections.abc import Iterable, Iterator, Mapping, Sequence
@@ -77,7 +76,6 @@ from datachain.utils import (
 )

 from .datasource import DataSource
-from .subclass import SubclassFinder

 if TYPE_CHECKING:
     from datachain.data_storage import (
@@ -92,7 +90,6 @@ logger = logging.getLogger("datachain")

 DEFAULT_DATASET_DIR = "dataset"
 DATASET_FILE_SUFFIX = ".edatachain"
-FEATURE_CLASSES = ["DataModel"]

 TTL_INT = 4 * 60 * 60

@@ -156,8 +153,6 @@ class QueryResult(NamedTuple):
     dataset: Optional[DatasetRecord]
     version: Optional[int]
     output: str
-    preview: Optional[list[dict]]
-    metrics: dict[str, Any]


 class DatasetRowsFetcher(NodesThreadPool):
@@ -571,12 +566,6 @@ def find_column_to_str( # noqa: PLR0911
     return ""


-def form_module_source(source_ast):
-    module = ast.Module(body=source_ast, type_ignores=[])
-    module = ast.fix_missing_locations(module)
-    return ast.unparse(module)
-
-
 class Catalog:
     def __init__(
         self,
@@ -660,33 +649,12 @@ class Catalog:
                 ),
             ]
             code_ast.body[-1:] = new_expressions
-        else:
-            raise Exception("Last line in a script was not an expression")
         return code_ast

-    def compile_query_script(
-        self, script: str, feature_module_name: str
-    ) -> tuple[Union[str, None], str]:
+    def compile_query_script(self, script: str) -> str:
         code_ast = ast.parse(script)
         code_ast = self.attach_query_wrapper(code_ast)
-        finder = SubclassFinder(FEATURE_CLASSES)
-        finder.visit(code_ast)
-
-        if not finder.feature_class:
-            main_module = form_module_source([*finder.imports, *finder.main_body])
-            return None, main_module
-
-        feature_import = ast.ImportFrom(
-            module=feature_module_name,
-            names=[ast.alias(name="*", asname=None)],
-            level=0,
-        )
-        feature_module = form_module_source([*finder.imports, *finder.feature_class])
-        main_module = form_module_source(
-            [*finder.imports, feature_import, *finder.main_body]
-        )
-
-        return feature_module, main_module
+        return ast.unparse(code_ast)

     def parse_url(self, uri: str, **config: Any) -> tuple[Client, str]:
         config = config or self.client_config
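With the feature-class splitting gone, compiling a query script reduces to wrapping its trailing expression and unparsing the AST back to source. A standalone sketch of that flow using only the stdlib `ast` module (the `query_wrapper` name is illustrative, not datachain's actual internal symbol):

```python
import ast


def compile_query_script(script: str) -> str:
    """Sketch: parse a query script, wrap its last expression, return new source."""
    code_ast = ast.parse(script)
    last = code_ast.body[-1]
    if isinstance(last, ast.Expr):
        # Wrap the trailing expression in a hypothetical runner call so the
        # subprocess can detect and save the resulting chain.
        wrapper = ast.Expr(
            value=ast.Call(
                func=ast.Name(id="query_wrapper", ctx=ast.Load()),  # illustrative name
                args=[last.value],
                keywords=[],
            )
        )
        code_ast.body[-1:] = [ast.fix_missing_locations(wrapper)]
    return ast.unparse(code_ast)


print(compile_query_script("1 + 1"))  # -> "query_wrapper(1 + 1)"
```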
@@ -1020,20 +988,6 @@ class Catalog:

         return node_groups

-    def unlist_source(self, uri: StorageURI) -> None:
-        self.metastore.clone(uri=uri).mark_storage_not_indexed(uri)
-
-    def storage_stats(self, uri: StorageURI) -> Optional[DatasetStats]:
-        """
-        Returns tuple with storage stats: total number of rows and total dataset size.
-        """
-        partial_path = self.metastore.get_last_partial_path(uri)
-        if partial_path is None:
-            return None
-        dataset = self.get_dataset(Storage.dataset_name(uri, partial_path))
-
-        return self.dataset_stats(dataset.name, dataset.latest_version)
-
     def create_dataset(
         self,
         name: str,
@@ -1297,19 +1251,6 @@ class Catalog:

         return self.get_dataset(name)

-    def register_new_dataset(
-        self,
-        source_dataset: DatasetRecord,
-        source_version: int,
-        target_name: str,
-    ) -> DatasetRecord:
-        target_dataset = self.metastore.create_dataset(
-            target_name,
-            query_script=source_dataset.query_script,
-            schema=source_dataset.serialized_schema,
-        )
-        return self.register_dataset(source_dataset, source_version, target_dataset, 1)
-
     def register_dataset(
         self,
         dataset: DatasetRecord,
@@ -1422,17 +1363,18 @@ class Catalog:

         return direct_dependencies

-    def ls_datasets(self) -> Iterator[DatasetRecord]:
+    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetRecord]:
         datasets = self.metastore.list_datasets()
         for d in datasets:
-            if not d.is_bucket_listing:
+            if not d.is_bucket_listing or include_listing:
                 yield d

     def list_datasets_versions(
         self,
+        include_listing: bool = False,
     ) -> Iterator[tuple[DatasetRecord, "DatasetVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets())
+        datasets = list(self.ls_datasets(include_listing=include_listing))

         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
@@ -1444,7 +1386,8 @@ class Catalog:

         for d in datasets:
             yield from (
-                (d, v, jobs.get(v.job_id) if v.job_id else None) for v in d.versions
+                (d, v, jobs.get(str(v.job_id)) if v.job_id else None)
+                for v in d.versions
             )

     def ls_dataset_rows(
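Bucket-listing datasets stay hidden by default; the new flag opts back in. A hypothetical call site (assumes an existing `catalog` instance):

```python
# User datasets only (default behaviour).
for ds in catalog.ls_datasets():
    print(ds.name)

# Include internal bucket-listing datasets as well, with version/job info.
for ds, version, job in catalog.list_datasets_versions(include_listing=True):
    # job is None when the version was not produced by a tracked job
    print(ds.name, version.version, job.id if job else None)
```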
@@ -1632,15 +1575,6 @@ class Catalog:
         for source in data_sources:  # type: ignore [union-attr]
             yield source, source.ls(fields)

-    def ls_storage_uris(self) -> Iterator[str]:
-        yield from self.metastore.get_all_storage_uris()
-
-    def get_storage(self, uri: StorageURI) -> Storage:
-        return self.metastore.get_storage(uri)
-
-    def ls_storages(self) -> list[Storage]:
-        return self.metastore.list_storages()
-
     def pull_dataset(
         self,
         dataset_uri: str,
@@ -1874,10 +1808,6 @@ class Catalog:
         envs: Optional[Mapping[str, str]] = None,
         python_executable: Optional[str] = None,
         save: bool = False,
-        save_as: Optional[str] = None,
-        preview_limit: int = 10,
-        preview_offset: int = 0,
-        preview_columns: Optional[list[str]] = None,
         capture_output: bool = True,
         output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
@@ -1905,34 +1835,25 @@ class Catalog:
                 C.size > 1000
             )
         """
-        from datachain.query.dataset import ExecutionResult
-
-        feature_file = tempfile.NamedTemporaryFile(  # noqa: SIM115
-            dir=os.getcwd(), suffix=".py", delete=False
-        )
-        _, feature_module = os.path.split(feature_file.name)
-
-        try:
-            lines, proc, response_text = self.run_query(
-                python_executable or sys.executable,
-                query_script,
-                envs,
-                feature_file,
-                capture_output,
-                feature_module,
-                output_hook,
-                params,
-                preview_columns,
-                preview_limit,
-                preview_offset,
-                save,
-                save_as,
-                job_id,
+        if not job_id:
+            python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+            job_id = self.metastore.create_job(
+                name="",
+                query=query_script,
+                params=params,
+                python_version=python_version,
             )
-        finally:
-            feature_file.close()
-            os.unlink(feature_file.name)

+        lines, proc = self.run_query(
+            python_executable or sys.executable,
+            query_script,
+            envs,
+            capture_output,
+            output_hook,
+            params,
+            save,
+            job_id,
+        )
         output = "".join(lines)

         if proc.returncode:
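Preview and save-as handling are now entirely out of `Catalog.query()`: the method creates a job up front (when no `job_id` is passed) and reports only the dataset produced by that job. A hypothetical call site under the new signature (assumes an existing `catalog` instance and a query script on disk):

```python
with open("my_query.py", encoding="utf-8") as f:  # hypothetical script path
    script_text = f.read()

result = catalog.query(script_text, save=True, capture_output=False)
print(result.output)
if result.dataset is not None:
    print(result.dataset.name, result.version)
```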
@@ -1942,105 +1863,69 @@ class Catalog:
                    return_code=proc.returncode,
                    output=output,
                )
-            if proc.returncode == QUERY_SCRIPT_INVALID_LAST_STATEMENT_EXIT_CODE:
-                raise QueryScriptRunError(
-                    "Last line in a script was not an instance of DataChain",
-                    return_code=proc.returncode,
-                    output=output,
-                )
             raise QueryScriptRunError(
                 f"Query script exited with error code {proc.returncode}",
                 return_code=proc.returncode,
                 output=output,
             )

+        def _get_dataset_versions_by_job_id():
+            for dr, dv, job in self.list_datasets_versions():
+                if job and str(job.id) == job_id:
+                    yield dr, dv
+
         try:
-            response = json.loads(response_text)
-        except ValueError:
-            response = {}
-        exec_result = ExecutionResult(**response)
-
-        dataset: Optional[DatasetRecord] = None
-        version: Optional[int] = None
-        if save or save_as:
-            dataset, version = self.save_result(
-                query_script, exec_result, output, version, job_id
+            dr, dv = max(
+                _get_dataset_versions_by_job_id(), key=lambda x: x[1].created_at
             )
+        except ValueError as e:
+            if not save:
+                return QueryResult(dataset=None, version=None, output=output)

-        return QueryResult(
-            dataset=dataset,
-            version=version,
-            output=output,
-            preview=exec_result.preview,
-            metrics=exec_result.metrics,
+            raise QueryScriptDatasetNotFound(
+                "No dataset found after running Query script",
+                output=output,
+            ) from e
+
+        dr = self.update_dataset(
+            dr,
+            script_output=output,
+            query_script=query_script,
         )
+        self.update_dataset_version_with_warehouse_info(
+            dr,
+            dv.version,
+            script_output=output,
+            query_script=query_script,
+            job_id=job_id,
+            is_job_result=True,
+        )
+        return QueryResult(dataset=dr, version=dv.version, output=output)

     def run_query(
         self,
         python_executable: str,
         query_script: str,
         envs: Optional[Mapping[str, str]],
-        feature_file: IO[bytes],
         capture_output: bool,
-        feature_module: str,
         output_hook: Callable[[str], None],
         params: Optional[dict[str, str]],
-        preview_columns: Optional[list[str]],
-        preview_limit: int,
-        preview_offset: int,
         save: bool,
-        save_as: Optional[str],
         job_id: Optional[str],
-    ) -> tuple[list[str], subprocess.Popen, str]:
+    ) -> tuple[list[str], subprocess.Popen]:
         try:
-            feature_code, query_script_compiled = self.compile_query_script(
-                query_script, feature_module[:-3]
-            )
-            if feature_code:
-                feature_file.write(feature_code.encode())
-                feature_file.flush()
-
+            query_script_compiled = self.compile_query_script(query_script)
         except Exception as exc:
             raise QueryScriptCompileError(
                 f"Query script failed to compile, reason: {exc}"
             ) from exc
-        if save_as and save_as.startswith(QUERY_DATASET_PREFIX):
-            raise ValueError(
-                f"Cannot use {QUERY_DATASET_PREFIX} prefix for dataset name"
-            )
-        r, w = os.pipe()
-        if os.name == "nt":
-            import msvcrt
-
-            os.set_inheritable(w, True)
-
-            startupinfo = subprocess.STARTUPINFO()  # type: ignore[attr-defined]
-            handle = msvcrt.get_osfhandle(w)  # type: ignore[attr-defined]
-            startupinfo.lpAttributeList["handle_list"].append(handle)
-            kwargs: dict[str, Any] = {"startupinfo": startupinfo}
-        else:
-            handle = w
-            kwargs = {"pass_fds": [w]}
         envs = dict(envs or os.environ)
-        if feature_code:
-            envs["DATACHAIN_FEATURE_CLASS_SOURCE"] = json.dumps(
-                {feature_module: feature_code}
-            )
         envs.update(
             {
                 "DATACHAIN_QUERY_PARAMS": json.dumps(params or {}),
                 "PYTHONPATH": os.getcwd(),  # For local imports
-                "DATACHAIN_QUERY_PREVIEW_ARGS": json.dumps(
-                    {
-                        "limit": preview_limit,
-                        "offset": preview_offset,
-                        "columns": preview_columns,
-                    }
-                ),
                 "DATACHAIN_QUERY_SAVE": "1" if save else "",
-                "DATACHAIN_QUERY_SAVE_AS": save_as or "",
                 "PYTHONUNBUFFERED": "1",
-                "DATACHAIN_OUTPUT_FD": str(handle),
                 "DATACHAIN_JOB_ID": job_id or "",
            },
        )
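Note how the new code leans on `max()` raising `ValueError` for an empty iterable: if no dataset version is tagged with this job id, the generator is empty and the `except ValueError` branch turns that into either a bare `QueryResult` or `QueryScriptDatasetNotFound`. A self-contained sketch of the same selection pattern (the dataclass is an illustrative stand-in, not datachain's model):

```python
from dataclasses import dataclass
from datetime import datetime


@dataclass
class FakeVersion:  # stand-in for DatasetVersion
    version: int
    created_at: datetime
    job_id: str


def latest_for_job(versions: list[FakeVersion], job_id: str) -> FakeVersion:
    matching = (v for v in versions if v.job_id == job_id)
    # max() raises ValueError when `matching` is empty -- the same signal the
    # Catalog code uses to detect "no dataset produced by this job".
    return max(matching, key=lambda v: v.created_at)


versions = [
    FakeVersion(1, datetime(2024, 8, 1), "job-a"),
    FakeVersion(2, datetime(2024, 8, 2), "job-a"),
]
print(latest_for_job(versions, "job-a").version)  # -> 2
try:
    latest_for_job(versions, "job-b")
except ValueError:
    print("no dataset found for job-b")
```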
@@ -2051,52 +1936,12 @@ class Catalog:
             stderr=subprocess.STDOUT if capture_output else None,
             bufsize=1,
             text=False,
-            **kwargs,
         ) as proc:
-            os.close(w)
-
             out = proc.stdout
             _lines: list[str] = []
             ctx = print_and_capture(out, output_hook) if out else nullcontext(_lines)
-
-            with ctx as lines, open(r) as f:
-                response_text = ""
-                while proc.poll() is None:
-                    response_text += f.readline()
-                    time.sleep(0.1)
-                response_text += f.readline()
-                return lines, proc, response_text
-
-    def save_result(self, query_script, exec_result, output, version, job_id):
-        if not exec_result.dataset:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            )
-        name, version = exec_result.dataset
-        # finding returning dataset
-        try:
-            dataset = self.get_dataset(name)
-            dataset.get_version(version)
-        except (DatasetNotFoundError, ValueError) as e:
-            raise QueryScriptDatasetNotFound(
-                "No dataset found after running Query script",
-                output=output,
-            ) from e
-        dataset = self.update_dataset(
-            dataset,
-            script_output=output,
-            query_script=query_script,
-        )
-        self.update_dataset_version_with_warehouse_info(
-            dataset,
-            version,
-            script_output=output,
-            query_script=query_script,
-            job_id=job_id,
-            is_job_result=True,
-        )
-        return dataset, version
+            with ctx as lines:
+                return lines, proc

     def cp(
         self,
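With the pipe and `msvcrt` plumbing removed, the parent process talks to the query subprocess only through environment variables and captured stdout. A rough sketch of that launch pattern using the variable names from the hunk above (the script body and job id are placeholders, not datachain internals):

```python
import json
import os
import subprocess
import sys

# Environment contract taken from the hunk above; values here are examples.
envs = dict(os.environ)
envs.update(
    {
        "DATACHAIN_QUERY_PARAMS": json.dumps({"limit": "10"}),
        "PYTHONPATH": os.getcwd(),
        "DATACHAIN_QUERY_SAVE": "1",
        "PYTHONUNBUFFERED": "1",
        "DATACHAIN_JOB_ID": "example-job-id",  # hypothetical id
    }
)

compiled_script = "print('hello from query script')"  # stands in for the compiled AST
with subprocess.Popen(
    [sys.executable, "-c", compiled_script],
    env=envs,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
) as proc:
    output, _ = proc.communicate()

print(output, proc.returncode)
```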
datachain/cli.py CHANGED
@@ -14,6 +14,7 @@ import shtab

 from datachain import utils
 from datachain.cli_utils import BooleanOptionalAction, CommaSeparatedArgs, KeyValueArgs
+from datachain.lib.dc import DataChain
 from datachain.utils import DataChainDir

 if TYPE_CHECKING:
@@ -472,9 +473,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     query_parser.add_argument(
         "script", metavar="<script.py>", type=str, help="Filepath for script"
     )
-    query_parser.add_argument(
-        "dataset_name", nargs="?", type=str, help="Save result dataset as"
-    )
     query_parser.add_argument(
         "--parallel",
         nargs="?",
@@ -487,7 +485,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
             "N defaults to the CPU count."
         ),
     )
-    add_show_args(query_parser)
     query_parser.add_argument(
         "-p",
         "--param",
@@ -619,18 +616,6 @@ def _ls_urls_flat(
         raise FileNotFoundError(f"No such file or directory: {source}")


-def ls_indexed_storages(catalog: "Catalog", long: bool = False) -> Iterator[str]:
-    from datachain.node import long_line_str
-
-    storage_uris = catalog.ls_storage_uris()
-    if long:
-        for uri in storage_uris:
-            # TODO: add Storage.created so it can be used here
-            yield long_line_str(uri, None, "")
-    else:
-        yield from storage_uris
-
-
 def ls_local(
     sources,
     long: bool = False,
@@ -661,8 +646,9 @@ def ls_local(
         for entry in entries:
             print(format_ls_entry(entry))
     else:
-        for entry in ls_indexed_storages(catalog, long=long):
-            print(format_ls_entry(entry))
+        chain = DataChain.listings()
+        for ls in chain.collect("listing"):
+            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]


 def format_ls_entry(entry: str) -> str:
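Bare `datachain ls` now reads listings through the `DataChain` API instead of the removed storage tables. A hypothetical standalone equivalent of the new branch (field names `uri` and `version` come from the hunk above):

```python
from datachain.lib.dc import DataChain

# Iterate over known storage listings and print them in the same
# "uri@vN" format the CLI uses.
for listing in DataChain.listings().collect("listing"):
    print(f"{listing.uri}@v{listing.version}")
```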
@@ -813,16 +799,10 @@ def show(
 def query(
     catalog: "Catalog",
     script: str,
-    dataset_name: Optional[str] = None,
     parallel: Optional[int] = None,
-    limit: int = 10,
-    offset: int = 0,
-    columns: Optional[list[str]] = None,
-    no_collapse: bool = False,
     params: Optional[dict[str, str]] = None,
 ) -> None:
     from datachain.data_storage import JobQueryType, JobStatus
-    from datachain.utils import show_records

     with open(script, encoding="utf-8") as f:
         script_content = f.read()
@@ -843,13 +823,9 @@
     )

     try:
-        result = catalog.query(
+        catalog.query(
             script_content,
             python_executable=python_executable,
-            save_as=dataset_name,
-            preview_limit=limit,
-            preview_offset=offset,
-            preview_columns=columns,
             capture_output=False,
             params=params,
             job_id=job_id,
@@ -864,10 +840,7 @@
             error_stack=error_stack,
         )
         raise
-
-    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE, metrics=result.metrics)
-
-    show_records(result.preview, collapse_columns=not no_collapse)
+    catalog.metastore.set_job_status(job_id, JobStatus.COMPLETE)


 def clear_cache(catalog: "Catalog"):
@@ -1042,12 +1015,7 @@ def main(argv: Optional[list[str]] = None) -> int:  # noqa: C901, PLR0912, PLR09
         query(
             catalog,
             args.script,
-            dataset_name=args.dataset_name,
             parallel=args.parallel,
-            limit=args.limit,
-            offset=args.offset,
-            columns=args.columns,
-            no_collapse=args.no_collapse,
             params=args.param,
         )
     elif args.command == "apply-udf":
datachain/client/fsspec.py CHANGED
@@ -87,6 +87,7 @@ class Client(ABC):
     def get_implementation(url: str) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
+        from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3

@@ -104,6 +105,8 @@ class Client(ABC):
             return AzureClient
         if protocol == FileClient.protocol:
             return FileClient
+        if protocol == HfClient.protocol:
+            return HfClient

         raise NotImplementedError(f"Unsupported protocol: {protocol}")

datachain/client/hf.py ADDED
@@ -0,0 +1,47 @@
+import os
+import posixpath
+from typing import Any, cast
+
+from huggingface_hub import HfFileSystem
+
+from datachain.lib.file import File
+from datachain.node import Entry
+
+from .fsspec import Client
+
+
+class HfClient(Client):
+    FS_CLASS = HfFileSystem
+    PREFIX = "hf://"
+    protocol = "hf"
+
+    @classmethod
+    def create_fs(cls, **kwargs) -> HfFileSystem:
+        if os.environ.get("HF_TOKEN"):
+            kwargs["token"] = os.environ["HF_TOKEN"]
+
+        return cast(HfFileSystem, super().create_fs(**kwargs))
+
+    def convert_info(self, v: dict[str, Any], path: str) -> Entry:
+        return Entry.from_file(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    def info_to_file(self, v: dict[str, Any], path: str) -> File:
+        return File(
+            path=path,
+            size=v["size"],
+            version=v["last_commit"].oid,
+            etag=v.get("blob_id", ""),
+            last_modified=v["last_commit"].date,
+        )
+
+    async def ls_dir(self, path):
+        return self.fs.ls(path, detail=True)
+
+    def rel_path(self, path):
+        return posixpath.relpath(path, self.name)
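Taken together with the `get_implementation` hunk above, `hf://` URIs now dispatch to `HfClient`, which wraps `huggingface_hub.HfFileSystem` and forwards `HF_TOKEN` when it is set. A rough usage sketch (repository paths are placeholders; requires `huggingface_hub` to be installed):

```python
import os

from huggingface_hub import HfFileSystem

from datachain.client.fsspec import Client

# "hf://" URLs should now resolve to the new client class.
client_cls = Client.get_implementation("hf://datasets/user/repo/file.txt")  # placeholder URL
print(client_cls.__name__)  # expected: HfClient

# Same token handling as HfClient.create_fs: pass HF_TOKEN through when set.
fs = HfFileSystem(token=os.environ.get("HF_TOKEN"))
for info in fs.ls("datasets/user/repo", detail=True):  # placeholder repo path
    # convert_info()/info_to_file() above read "size", "blob_id" and "last_commit".
    print(info["name"], info["size"])
```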
datachain/data_storage/metastore.py CHANGED
@@ -167,21 +167,10 @@ class AbstractMetastore(ABC, Serializable):
        This method should be called when index operation is finished.
        """

-    @abstractmethod
-    def mark_storage_not_indexed(self, uri: StorageURI) -> None:
-        """
-        Mark storage as not indexed.
-        This method should be called when storage index is deleted.
-        """
-
     @abstractmethod
     def update_last_inserted_at(self, uri: Optional[StorageURI] = None) -> None:
         """Updates last inserted datetime in bucket with current time."""

-    @abstractmethod
-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-
     @abstractmethod
     def get_storage(self, uri: StorageURI) -> Storage:
         """
@@ -189,10 +178,6 @@ class AbstractMetastore(ABC, Serializable):
        E.g. if s3 is used as storage this would be s3 bucket data.
        """

-    @abstractmethod
-    def list_storages(self) -> list[Storage]:
-        """Returns all storages."""
-
     @abstractmethod
     def mark_storage_pending(self, storage: Storage) -> Storage:
         """Marks storage as pending."""
@@ -324,7 +309,7 @@ class AbstractMetastore(ABC, Serializable):
             self.add_dataset_dependency(
                 source_dataset_name,
                 source_dataset_version,
-                dependency.name,
+                dependency.dataset_name,
                 int(dependency.version),
            )
        else:
@@ -906,11 +891,6 @@ class AbstractDBMetastore(AbstractMetastore):
            self._storages_update().where(s.c.uri == uri).values(**updates)  # type: ignore [attr-defined]
        )

-    def get_all_storage_uris(self) -> Iterator[StorageURI]:
-        """Returns all storage uris."""
-        s = self._storages
-        yield from (r[0] for r in self.db.execute(self._storages_select(s.c.uri)))
-
     def get_storage(self, uri: StorageURI, conn=None) -> Storage:
         """
         Gets storage representation from database.
@@ -926,13 +906,6 @@ class AbstractDBMetastore(AbstractMetastore):

         return self.storage_class._make(result)

-    def list_storages(self) -> list[Storage]:
-        result = self.db.execute(self._storages_select())
-        if not result:
-            return []
-
-        return [self.storage_class._make(r) for r in result]
-
     def mark_storage_pending(self, storage: Storage, conn=None) -> Storage:
         # Update status to pending and dates
         updates = {
@@ -1503,7 +1476,7 @@ class AbstractDBMetastore(AbstractMetastore):
         return self._jobs.update().where(*where)

     def _parse_job(self, rows) -> Job:
-        return Job.parse(*rows)
+        return self.job_class.parse(*rows)

     def _parse_jobs(self, rows) -> Iterator["Job"]:
         for _, g in groupby(rows, lambda r: r[0]):