deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -1,18 +1,30 @@
  """
- This module defines the DataSet class with is used to manipulate datasets in DerivaML,
- The intended use of this class is as a base class in DerivaML so all the methods documented here are
+ This module defines the DataSet class with is used to manipulate datasets in DerivaML.
+ The intended use of this class is as a base class in DerivaML, so all the methods documented here are
  accessible via a DerivaML class instance.

-
  """

  from __future__ import annotations
- from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
+ from bdbag.fetch.fetcher import fetch_single_file
  from collections import defaultdict
+ from graphlib import TopologicalSorter
+ import json
+ import logging
+ from pathlib import Path
+ from pydantic import (
+ validate_call,
+ ConfigDict,
+ )
+ import requests
+ from tempfile import TemporaryDirectory
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
+

  from deriva.core.ermrest_model import Table
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
+ import deriva.core.utils.hash_utils as hash_utils
  from deriva.transfer.download.deriva_export import DerivaExport
  from deriva.transfer.download.deriva_download import (
  DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
  DerivaDownloadTimeoutError,
  )

+
  try:
  from icecream import ic
  except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

- from graphlib import TopologicalSorter
- import json
- import logging
- from pathlib import Path
- from pydantic import (
- validate_call,
- ConfigDict,
- )
- import requests
-
- from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
-
  from deriva_ml import DatasetBag
  from .deriva_definitions import (
  ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
  RID,
  DRY_RUN_RID,
  )
- from .history import iso_to_snap
  from .deriva_model import DerivaModel
  from .database_model import DatabaseModel
  from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:

  _Logger = logging.getLogger("deriva_ml")

- def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
+ def __init__(
+ self,
+ model: DerivaModel,
+ cache_dir: Path,
+ working_dir: Path,
+ use_minid: bool = True,
+ ):
  self._model = model
  self._ml_schema = ML_SCHEMA
  self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
  self._cache_dir = cache_dir
  self._working_dir = working_dir
  self._logger = logging.getLogger("deriva_ml")
+ self._use_minid = use_minid

  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
  try:
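The constructor now takes a `use_minid` flag that decides whether dataset bags are published through a MINID landing page or exported directly from the catalog. A minimal sketch of passing the new flag, assuming the module layout shown above (`deriva_ml.dataset.Dataset`, `deriva_ml.deriva_model.DerivaModel`); in normal use the `DerivaML` class constructs this mixin for you, so the helper below is purely illustrative:

```python
from pathlib import Path

from deriva_ml.dataset import Dataset
from deriva_ml.deriva_model import DerivaModel


def make_dataset_api(model: DerivaModel, root: Path) -> Dataset:
    """Illustrative helper: build the Dataset mixin with MINID minting turned off."""
    return Dataset(
        model=model,               # an already-constructed DerivaModel (assumed)
        cache_dir=root / "cache",  # where downloaded bags are cached
        working_dir=root / "work", # scratch space for exports
        use_minid=False,           # new flag: skip MINID landing pages, fetch bags directly
    )
```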
@@ -100,27 +106,28 @@ class Dataset:
  dataset_list: list[DatasetSpec],
  description: Optional[str] = "",
  execution_rid: Optional[RID] = None,
- ) -> list[dict[str, Any]]:
+ ) -> None:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-
+ # determine snapshot after changes were made
+ snap = self._model.catalog.get("/").json()["snaptime"]
  # Construct version records for insert
- version_records = [
- {
- "Dataset": dataset.rid,
- "Version": str(dataset.version),
- "Description": description,
- "Execution": execution_rid,
- }
- for dataset in dataset_list
- ]
+ version_records = schema_path.tables["Dataset_Version"].insert(
+ [
+ {
+ "Dataset": dataset.rid,
+ "Version": str(dataset.version),
+ "Description": description,
+ "Execution": execution_rid,
+ "Snapshot": snap,
+ }
+ for dataset in dataset_list
+ ]
+ )

- # Insert version records and construct entities for updating the dataset version column.
- version_rids = [
- {"Version": v["RID"], "RID": v["Dataset"]}
- for v in schema_path.tables["Dataset_Version"].insert(version_records)
- ]
- schema_path.tables["Dataset"].update(version_rids)
- return version_rids
+ # And update the dataset records.
+ schema_path.tables["Dataset"].update(
+ [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
+ )

  def _bootstrap_versions(self):
  datasets = [ds["RID"] for ds in self.find_datasets()]
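With this change each Dataset_Version row carries the ERMrest snapshot id that was current when the version was recorded, rather than deriving a snapshot later from the row's creation time. A small sketch of the snapshot capture, assuming `catalog` is a connected `deriva.core` ErmrestCatalog like the one held by DerivaModel; the description and RID values are made up:

```python
from deriva.core import ErmrestCatalog


def snapshot_version_record(catalog: ErmrestCatalog, dataset_rid: str, version: str) -> dict:
    """Sketch of the record shape _insert_dataset_versions now writes (illustrative fields only)."""
    snap = catalog.get("/").json()["snaptime"]  # ERMrest snapshot id current at version-bump time
    return {
        "Dataset": dataset_rid,
        "Version": version,
        "Description": "Added March imaging batch",  # hypothetical description
        "Execution": None,
        "Snapshot": snap,  # new column: pins this version to a catalog state
    }
```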
@@ -170,6 +177,9 @@ class Dataset:
  Returns:
  A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
  """
+
+ if not self._is_dataset_rid(dataset_rid):
+ raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
  version_path = (
  self._model.catalog.getPathBuilder()
  .schemas[self._ml_schema]
@@ -179,7 +189,7 @@ class Dataset:
  DatasetHistory(
  dataset_version=DatasetVersion.parse(v["Version"]),
  minid=v["Minid"],
- timestamp=v["RCT"],
+ snapshot=v["Snapshot"],
  dataset_rid=dataset_rid,
  version_rid=v["RID"],
  description=v["Description"],
@@ -240,7 +250,7 @@ class Dataset:

  Args:
  dataset_rid: RID of the dataset whose version is to be incremented.
- component: Which version of the dataset_table to increment. Major, Minor or Patch
+ component: Which version of the dataset_table to increment. Major, Minor, or Patch
  description: Description of the version update of the dataset_table.
  execution_rid: Which execution is performing increment.

@@ -248,7 +258,7 @@ class Dataset:
  new semantic version of the dataset_table as a 3-tuple

  Raises:
- DerivaMLException: if provided RID is not to a dataset_table.
+ DerivaMLException: if provided, RID is not to a dataset_table.
  """

  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +278,7 @@ class Dataset:
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
  self,
- type: str | list[str],
+ dataset_types: str | list[str],
  description: str,
  execution_rid: Optional[RID] = None,
  version: Optional[DatasetVersion] = None,
@@ -276,7 +286,7 @@ class Dataset:
  """Create a new dataset_table from the specified list of RIDs.

  Args:
- type: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
+ dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
  description: Description of the dataset_table.
  execution_rid: Execution under which the dataset_table will be created.
  version: Version of the dataset_table.
@@ -304,7 +314,7 @@ class Dataset:
  return False

  # Create the entry for the new dataset_table and get its RID.
- ds_types = [type] if isinstance(type, str) else type
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
  pb = self._model.catalog.getPathBuilder()
  for ds_type in ds_types:
  if not check_dataset_type(ds_type):
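Because create_dataset renames its first parameter from `type` to `dataset_types`, keyword callers must switch to the new name. A hedged usage sketch, assuming `ml` is a DerivaML instance and that "training" and "validation" are terms already defined in the DatasetType controlled vocabulary:

```python
# Single type, passed with the renamed keyword (was `type=` in 1.13.1).
training_rid = ml.create_dataset(
    dataset_types="training",
    description="Training split for the March labeling run",  # made-up description
)

# Multiple types can still be supplied as a list.
combined_rid = ml.create_dataset(
    dataset_types=["training", "validation"],
    description="Combined split used for cross-validation",
)
```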
@@ -452,7 +462,9 @@ class Dataset:
  )

  # self.model = self.catalog.getCatalogModel()
- self.dataset_table.annotations.update(self._generate_dataset_annotations())
+ self.dataset_table.annotations.update(
+ self._generate_dataset_download_annotations()
+ )
  self._model.model.apply()
  return table

@@ -464,7 +476,7 @@ class Dataset:

  Args:
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
- recurse: (Default value = False)
+ recurse: (Default value = False)
  limit: If provided, the maximum number of members to return for each element type.

  Returns:
@@ -530,8 +542,8 @@ class Dataset:
  dataset is incremented and the description, if provide is applied to that new version.

  Args:
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
- members: List of RIDs of members to add to the dataset_table.
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+ members: List of member RIDs to add to the dataset_table.
  validate: Check rid_list to make sure elements are not already in the dataset_table.
  description: Markdown description of the updated dataset.
  execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +556,7 @@ class Dataset:

  Args:
  member_rid:
- path: (Default value = None)
+ path: (Default value = None)

  Returns:

@@ -570,7 +582,7 @@ class Dataset:
  a.other_fkeys.pop().pk_table.name: a.table.name
  for a in self.dataset_table.find_associations()
  }
- # Get a list of all the types of objects that can be linked to a dataset_table.
+ # Get a list of all the object types that can be linked to a dataset_table.
  for m in members:
  try:
  rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +630,8 @@ class Dataset:
  dataset is incremented and the description, if provide is applied to that new version.

  Args:
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
- members: List of RIDs of members to add to the dataset_table.
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+ members: List of member RIDs to add to the dataset_table.
  description: Markdown description of the updated dataset.
  execution_rid: Optional RID of execution associated with this operation.
  """
@@ -634,7 +646,7 @@ class Dataset:
  a.other_fkeys.pop().pk_table.name: a.table.name
  for a in self.dataset_table.find_associations()
  }
- # Get a list of all the types of objects that can be linked to a dataset_table.
+ # Get a list of all the object types that can be linked to a dataset_table.
  for m in members:
  try:
  rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +682,7 @@ class Dataset:
  )

  @validate_call
- def list_dataset_parents(self, dataset_rid: RID) -> list[RID]:
+ def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
  """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
  nested dataset.

@@ -696,14 +708,14 @@ class Dataset:

  @validate_call
  def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
- """Given a dataset_table RID, return a list of RIDs of any nested datasets.
+ """Given a dataset_table RID, return a list of RIDs for any nested datasets.

  Args:
  dataset_rid: A dataset_table RID.
- recurse: If True, return a list of RIDs of any nested datasets.
+ recurse: If True, return a list of nested datasets RIDs.

  Returns:
- list of RIDs of nested datasets.
+ list of nested dataset RIDs.

  """
  dataset_dataset_path = (
@@ -726,7 +738,7 @@ class Dataset:

  return find_children(dataset_rid)

- def _vocabulary_specification(
+ def _export_vocabulary(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
  """
@@ -756,10 +768,10 @@ class Dataset:
  ) -> Iterator[tuple[str, str, Table]]:
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

- def source_path(path: tuple[Table, ...]):
+ def source_path(path: tuple[Table, ...]) -> list[str]:
  """Convert a tuple representing a path into a source path component with FK linkage"""
  path = list(path)
- p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+ p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
  for table in path[1:]:
  if table.name == "Dataset_Dataset":
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +815,7 @@ class Dataset:
  dataset_elements = [
  snapshot_catalog._model.name_to_table(e)
  for e, m in snapshot_catalog.list_dataset_members(
- dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
+ dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
  ).items()
  if m
  ]
@@ -857,7 +869,7 @@ class Dataset:
  """

  def children_depth(
- dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
+ dataset_rid: RID, nested_datasets: dict[str, list[str]]
  ) -> int:
  """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
  try:
@@ -899,13 +911,13 @@ class Dataset:
  def _dataset_specification(
  self,
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
- dataset: DatasetSpec,
+ dataset: Optional[DatasetSpec] = None,
  snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
- The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
+ The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
  will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
- subdirectories for each object that is reachable from the dataset_table members.
+ subdirectory for each object that is reachable from the dataset_table members.

  To simplify reconstructing the relationship between tables, the CVS for each
  The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +925,7 @@ class Dataset:

  For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
  objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
- which has two asset in it. The layout of the resulting bdbag would be:
+ which has two assets in it. The layout of the resulting bdbag would be:
  data
  CV1/
  cv1.csv
@@ -939,12 +951,12 @@ class Dataset:
  Returns:
  A dataset_table specification.
  """
- element_spec = []
+ element_spec = self._export_vocabulary(writer)
  for path in self._table_paths(
  dataset=dataset, snapshot_catalog=snapshot_catalog
  ):
  element_spec.extend(writer(*path))
- return self._vocabulary_specification(writer) + element_spec
+ return element_spec

  def _download_dataset_bag(
  self,
@@ -964,7 +976,8 @@ class Dataset:
  for the dataset.
  """
  if (
- execution_rid != DRY_RUN_RID
+ execution_rid
+ and execution_rid != DRY_RUN_RID
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
  ):
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
@@ -984,7 +997,7 @@ class Dataset:
  for h in self.dataset_history(dataset_rid=dataset.rid)
  if h.dataset_version == dataset.version
  ][0]
- return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
+ return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

  def _create_dataset_minid(
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
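_version_snapshot now returns the stored snapshot verbatim, so a dataset version maps to a catalog reference of the form `<catalog_id>@<snaptime>`. A sketch of opening such a snapshot-pinned catalog to look at the data exactly as it was for that version; the host, catalog id, and snapshot value are placeholders:

```python
from deriva.core import ErmrestCatalog, get_credential

host = "example.derivacloud.org"   # placeholder host
pinned_id = "1@2TB-X2RM-10GW"      # "<catalog_id>@<snaptime>", e.g. a value from _version_snapshot()

# Connect to the catalog as it existed at that snapshot and count Dataset rows.
catalog = ErmrestCatalog("https", host, pinned_id, credentials=get_credential(host))
pb = catalog.getPathBuilder()
datasets = pb.schemas["deriva-ml"].tables["Dataset"].entities().fetch()
print(f"Datasets visible at this snapshot: {len(list(datasets))}")
```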
@@ -999,7 +1012,7 @@ class Dataset:
  )
  try:
  self._logger.info(
- f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
+ f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
  )
  # Generate the bag and put into S3 storage.
  exporter = DerivaExport(
@@ -1008,9 +1021,10 @@ class Dataset:
  output_dir=tmp_dir,
  defer_download=True,
  timeout=(10, 610),
- envars={"Dataset_RID": dataset.rid},
+ envars={"RID": dataset.rid},
  )
  minid_page_url = exporter.export()[0] # Get the MINID launch page
+
  except (
  DerivaDownloadError,
  DerivaDownloadConfigurationError,
@@ -1020,17 +1034,18 @@ class Dataset:
  ) as e:
  raise DerivaMLException(format_exception(e))
  # Update version table with MINID.
- version_path = (
- self._model.catalog.getPathBuilder()
- .schemas[self._ml_schema]
- .tables["Dataset_Version"]
- )
- version_rid = [
- h
- for h in self.dataset_history(dataset_rid=dataset.rid)
- if h.dataset_version == dataset.version
- ][0].version_rid
- version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
+ if self._use_minid:
+ version_path = (
+ self._model.catalog.getPathBuilder()
+ .schemas[self._ml_schema]
+ .tables["Dataset_Version"]
+ )
+ version_rid = [
+ h
+ for h in self.dataset_history(dataset_rid=dataset.rid)
+ if h.dataset_version == dataset.version
+ ][0].version_rid
+ version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
  return minid_page_url

  def _get_dataset_minid(
@@ -1073,14 +1088,25 @@ class Dataset:
  raise DerivaMLException(
  f"Minid for dataset {dataset.rid} doesn't exist"
  )
- self._logger.info("Creating new MINID for dataset %s", dataset.rid)
+ if self._use_minid:
+ self._logger.info("Creating new MINID for dataset %s", dataset.rid)
  minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
- r = requests.get(minid_url, headers={"accept": "application/json"})
- return DatasetMinid(dataset_version=dataset.version, **r.json())
+ if self._use_minid:
+ r = requests.get(minid_url, headers={"accept": "application/json"})
+ dataset_minid = DatasetMinid(
+ dataset_version=dataset.version, **r.json()
+ )
+ else:
+ dataset_minid = DatasetMinid(
+ dataset_version=dataset.version,
+ RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
+ location=minid_url,
+ )
+ return dataset_minid

  def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
  that all the metadata is correct

  Args:
  Args:
@@ -1089,19 +1115,37 @@ class Dataset:
1089
1115
  the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
1090
1116
  """
1091
1117
 
1092
- # Check to see if we have an existing idempotent materialization of the desired bag. If so, then just reuse
1118
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1093
1119
  # it. If not, then we need to extract the contents of the archive into our cache directory.
1094
1120
  bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1095
1121
  if bag_dir.exists():
1096
- bag_path = (bag_dir / f"Dataset_{minid.dataset_rid}").as_posix()
1097
- else:
1098
- bag_dir.mkdir(parents=True, exist_ok=True)
1099
- with NamedTemporaryFile(
1100
- delete=False, suffix=f"Dataset_{minid.dataset_rid}.zip"
1101
- ) as zip_file:
1102
- archive_path = fetch_single_file(minid.bag_url, zip_file.name)
1103
- bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1104
- bdb.validate_bag_structure(bag_path)
1122
+ self._logger.info(
1123
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1124
+ )
1125
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1126
+
1127
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1128
+ with TemporaryDirectory() as tmp_dir:
1129
+ if self._use_minid:
1130
+ # Get bag from S3
1131
+ archive_path = fetch_single_file(minid.bag_url)
1132
+ else:
1133
+ exporter = DerivaExport(
1134
+ host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
1135
+ )
1136
+ archive_path = exporter.retrieve_file(minid.bag_url)
1137
+ hashes = hash_utils.compute_file_hashes(
1138
+ archive_path, hashes=["md5", "sha256"]
1139
+ )
1140
+ checksum = hashes["sha256"][0]
1141
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
1142
+ if bag_dir.exists():
1143
+ self._logger.info(
1144
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1145
+ )
1146
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1147
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1148
+ bdb.validate_bag_structure(bag_path)
1105
1149
  return Path(bag_path)
1106
1150
 
1107
1151
  def _materialize_dataset_bag(
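Bag caching is now keyed by the dataset RID plus the archive's SHA-256, which is computed with deriva's hash utilities when no MINID-supplied checksum is available. A self-contained sketch of that cache-key computation; the stand-in archive, RID, and cache location are made up:

```python
import tempfile
from pathlib import Path

import deriva.core.utils.hash_utils as hash_utils

# Stand-in archive so the sketch runs on its own; a real call hashes the zip
# produced by DerivaExport or fetched via fetch_single_file.
with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as f:
    f.write(b"example bag contents")
    archive_path = f.name

hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
checksum = hashes["sha256"][0]     # hex digest, first element of the returned pair
dataset_rid = "1-ABCD"             # hypothetical dataset RID
bag_dir = Path.home() / ".deriva-ml" / "cache" / f"{dataset_rid}_{checksum}"
print(f"Bag would be unpacked under: {bag_dir}")
```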
@@ -1120,17 +1164,18 @@ class Dataset:

  def update_status(status: Status, msg: str) -> None:
  """Update the current status for this execution in the catalog"""
- self._model.catalog.getPathBuilder().schemas[
- self._ml_schema
- ].Execution.update(
- [
- {
- "RID": execution_rid,
- "Status": status.value,
- "Status_Detail": msg,
- }
- ]
- )
+ if execution_rid and execution_rid != DRY_RUN_RID:
+ self._model.catalog.getPathBuilder().schemas[
+ self._ml_schema
+ ].Execution.update(
+ [
+ {
+ "RID": execution_rid,
+ "Status": status.value,
+ "Status_Detail": msg,
+ }
+ ]
+ )
  self._logger.info(msg)

  def fetch_progress_callback(current, total):
@@ -1152,6 +1197,9 @@ class Dataset:

  # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
  if not validated_check.exists():
+ self._logger.info(
+ f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
+ )
  bdb.materialize(
  bag_path.as_posix(),
  fetch_callback=fetch_progress_callback,
@@ -1160,9 +1208,8 @@ class Dataset:
  validated_check.touch()
  return Path(bag_path)

- def _export_outputs(
+ def _export_annotation(
  self,
- dataset: Optional[DatasetSpec] = None,
  snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Return and output specification for the datasets in the provided model
@@ -1171,19 +1218,6 @@ class Dataset:
  An export specification suitable for Chaise.
  """

- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
- """
-
- Args:
- spath: list[Table]:
- dpath: list[Table]:
- table: Table
-
- Returns:
- An export specification suitable for Chaise.
- """
- return self._export_dataset_element(spath, dpath, table)
-
  # Export specification is a specification for the datasets, plus any controlled vocabulary
  return [
  {
@@ -1202,41 +1236,34 @@ class Dataset:
  "destination": {"type": "json", "name": "schema"},
  },
  ] + self._dataset_specification(
- writer, dataset, snapshot_catalog=snapshot_catalog
+ self._export_annotation_dataset_element,
+ None,
+ snapshot_catalog=snapshot_catalog,
  )

- def _processor_params(
+ def _export_specification(
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
  ) -> list[dict[str, Any]]:
  """
+ Generate a specification for export engine for specific dataset.
+
  Returns:
  a download specification for the datasets in the provided model.

  """

- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
- """
-
- Args:
- spath:
- dpath:
- table: Table
-
- Returns:
-
- """
- return self._download_dataset_element(spath, dpath, table)
-
  # Download spec is the spec for any controlled vocabulary and for the dataset_table.
  return [
  {
  "processor": "json",
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
- ] + self._dataset_specification(writer, dataset, snapshot_catalog)
+ ] + self._dataset_specification(
+ self._export_specification_dataset_element, dataset, snapshot_catalog
+ )

  @staticmethod
- def _download_dataset_element(
+ def _export_specification_dataset_element(
  spath: str, dpath: str, table: Table
  ) -> list[dict[str, Any]]:
  """Return the download specification for the data object indicated by a path through the data model.
@@ -1253,7 +1280,7 @@ class Dataset:
  {
  "processor": "csv",
  "processor_params": {
- "query_path": f"/entity/{spath}?limit=none",
+ "query_path": f"/entity/{spath}",
  "output_path": dpath,
  },
  }
@@ -1266,16 +1293,15 @@ class Dataset:
  {
  "processor": "fetch",
  "processor_params": {
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
  "output_path": f"asset/{table.name}",
  },
  }
  )
  return exports

- @staticmethod
- def _export_dataset_element(
- spath: str, dpath: str, table: Table
+ def _export_annotation_dataset_element(
+ self, spath: str, dpath: str, table: Table
  ) -> list[dict[str, Any]]:
  """Given a path in the data model, output an export specification for the path taken to get to the current table.

@@ -1291,9 +1317,23 @@ class Dataset:
  # into a path in the form of /S:T1/S:T2/S:Table
  # Generate the destination path in the file system using just the table names.

+ skip_root_path = False
+ if spath.startswith(f"{self._ml_schema}:Dataset/"):
+ # Chaise will add table name and RID filter, so strip it off.
+ spath = "/".join(spath.split("/")[2:])
+ if spath == "":
+ # This path is to just the dataset table.
+ return []
+ else:
+ # A vocabulary table, so we don't want the root_path.
+ skip_root_path = True
  exports = [
  {
- "source": {"api": "entity", "path": spath},
+ "source": {
+ "api": "entity",
+ "path": spath,
+ "skip_root_path": skip_root_path,
+ },
  "destination": {"name": dpath, "type": "csv"},
  }
  ]
@@ -1304,6 +1344,7 @@ class Dataset:
  exports.append(
  {
  "source": {
+ "skip_root_path": False,
  "api": "attribute",
  "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
  },
@@ -1313,44 +1354,53 @@ class Dataset:
  return exports

  def _generate_dataset_download_spec(
- self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
  ) -> dict[str, Any]:
  """
+ Generate a specification for downloading a specific dataset.

+ This routine creates a download specification that can be used by the Deriva export processor to download
+ a specific dataset as a MINID.
  Returns:
  """
  s3_target = "s3://eye-ai-shared"
  minid_test = False

  catalog_id = self._version_snapshot(dataset)
- return {
- "env": {"Dataset_RID": "{Dataset_RID}"},
+ post_processors = (
+ {
+ "post_processors": [
+ {
+ "processor": "cloud_upload",
+ "processor_params": {
+ "acl": "public-read",
+ "target_url": s3_target,
+ },
+ },
+ {
+ "processor": "identifier",
+ "processor_params": {
+ "test": minid_test,
+ "env_column_map": {
+ "RID": "{RID}@{snaptime}",
+ "Description": "{Description}",
+ },
+ },
+ },
+ ]
+ }
+ if self._use_minid
+ else {}
+ )
+ return post_processors | {
+ "env": {"RID": "{RID}"},
  "bag": {
- "bag_name": "Dataset_{Dataset_RID}",
+ "bag_name": "Dataset_{RID}",
  "bag_algorithms": ["md5"],
  "bag_archiver": "zip",
  "bag_metadata": {},
  "bag_idempotent": True,
  },
- "post_processors": [
- {
- "processor": "cloud_upload",
- "processor_params": {
- "acl": "public-read",
- "target_url": s3_target,
- },
- },
- {
- "processor": "identifier",
- "processor_params": {
- "test": minid_test,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- },
- ],
  "catalog": {
  "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
  "catalog_id": catalog_id,
@@ -1366,125 +1416,50 @@ class Dataset:
  {
  "processor": "env",
  "processor_params": {
- "query_path": "/entity/M:=deriva-ml:Dataset/RID={Dataset_RID}?limit=none",
+ "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
  "output_path": "Dataset",
  "query_keys": ["RID", "Description"],
  },
  },
  ]
- + self._processor_params(dataset, snapshot_catalog),
+ + self._export_specification(dataset, snapshot_catalog),
  },
  }

- def dataset_visible_columns(self) -> dict[str, Any]:
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
- rcb_name = next(
- [fk.name[0].name, fk.name[1]]
- for fk in dataset_table.foreign_keys
- if fk.name[1] == "Dataset_RCB_fkey"
- )
- rmb_name = next(
- [fk.name[0].name, fk.name[1]]
- for fk in dataset_table.foreign_keys
- if fk.name[1] == "Dataset_RMB_fkey"
- )
- return {
- "*": [
- "RID",
- "Description",
- {
- "display": {
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
- },
- "markdown_name": "Annotation App",
- },
- rcb_name,
- rmb_name,
- ],
- "detailed": [
- "RID",
- "Description",
- {
- "source": [
- {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
- {
- "outbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_Type_fkey",
- ]
+ def _generate_dataset_download_annotations(self) -> dict[str, Any]:
+ post_processors = (
+ {
+ "type": "BAG",
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
+ "displayname": "BDBag to Cloud",
+ "bag_idempotent": True,
+ "postprocessors": [
+ {
+ "processor": "cloud_upload",
+ "processor_params": {
+ "acl": "public-read",
+ "target_url": "s3://eye-ai-shared/",
  },
- "RID",
- ],
- "markdown_name": "Dataset Types",
- },
- {
- "display": {
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
  },
- "markdown_name": "Annotation App",
- },
- rcb_name,
- rmb_name,
- ],
- "filter": {
- "and": [
- {"source": "RID"},
- {"source": "Description"},
  {
- "source": [
- {
- "inbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_fkey",
- ]
- },
- {
- "outbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_Type_fkey",
- ]
+ "processor": "identifier",
+ "processor_params": {
+ "test": False,
+ "env_column_map": {
+ "RID": "{RID}@{snaptime}",
+ "Description": "{Description}",
  },
- "RID",
- ],
- "markdown_name": "Dataset Types",
- },
- {
- "source": [{"outbound": rcb_name}, "RID"],
- "markdown_name": "Created By",
- },
- {
- "source": [{"outbound": rmb_name}, "RID"],
- "markdown_name": "Modified By",
+ },
  },
- ]
- },
- }
-
- def _dataset_visible_fkeys(self) -> dict[str, Any]:
- def fkey_name(fk):
- return [fk.name[0].name, fk.name[1]]
-
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
- source_list = [
- {
- "source": [
- {"inbound": fkey_name(fkey.self_fkey)},
- {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
- "RID",
  ],
- "markdown_name": other_fkey.pk_table.name,
  }
- for fkey in dataset_table.find_associations(max_arity=3, pure=False)
- ]
- return {"detailed": source_list}
-
- def _generate_dataset_annotations(self) -> dict[str, Any]:
+ if self._use_minid
+ else {}
+ )
  return {
  deriva_tags.export_fragment_definitions: {
- "dataset_export_outputs": self._export_outputs()
+ "dataset_export_outputs": self._export_annotation()
  },
- deriva_tags.visible_columns: self.dataset_visible_columns(),
  deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
  deriva_tags.export_2019: {
  "detailed": {
@@ -1494,45 +1469,56 @@ class Dataset:
  "outputs": [{"fragment_key": "dataset_export_outputs"}],
  "displayname": "BDBag Download",
  "bag_idempotent": True,
- "postprocessors": [
- {
- "processor": "identifier",
- "processor_params": {
- "test": False,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- }
- ],
- },
- {
- "type": "BAG",
- "outputs": [{"fragment_key": "dataset_export_outputs"}],
- "displayname": "BDBag to Cloud",
- "bag_idempotent": True,
- "postprocessors": [
- {
- "processor": "cloud_upload",
- "processor_params": {
- "acl": "public-read",
- "target_url": "s3://eye-ai-shared/",
- },
- },
- {
- "processor": "identifier",
- "processor_params": {
- "test": False,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- },
- ],
- },
+ }
+ | post_processors
  ]
  }
  },
  }
+
+ def _dataset_visible_fkeys(self) -> dict[str, Any]:
+ def fkey_name(fk):
+ return [fk.name[0].name, fk.name[1]]
+
+ dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
+
+ source_list = [
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Previous Versions",
+ "entity": True,
+ },
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Parent Datasets",
+ },
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Child Datasets",
+ },
+ ]
+ source_list.extend(
+ [
+ {
+ "source": [
+ {"inbound": fkey_name(fkey.self_fkey)},
+ {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
+ "RID",
+ ],
+ "markdown_name": other_fkey.pk_table.name,
+ }
+ for fkey in dataset_table.find_associations(max_arity=3, pure=False)
+ ]
+ )
+ return {"detailed": source_list}