deriva-ml 1.13.2__py3-none-any.whl → 1.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -1,18 +1,30 @@
1
1
  """
2
- This module defines the DataSet class with is used to manipulate datasets in DerivaML,
3
- The intended use of this class is as a base class in DerivaML so all the methods documented here are
2
+ This module defines the DataSet class which is used to manipulate datasets in DerivaML.
3
+ The intended use of this class is as a base class in DerivaML, so all the methods documented here are
4
4
  accessible via a DerivaML class instance.
5
5
 
6
-
7
6
  """
8
7
 
9
8
  from __future__ import annotations
10
- from bdbag.fetch.fetcher import fetch_single_file
11
9
  from bdbag import bdbag_api as bdb
10
+ from bdbag.fetch.fetcher import fetch_single_file
12
11
  from collections import defaultdict
12
+ from graphlib import TopologicalSorter
13
+ import json
14
+ import logging
15
+ from pathlib import Path
16
+ from pydantic import (
17
+ validate_call,
18
+ ConfigDict,
19
+ )
20
+ import requests
21
+ from tempfile import TemporaryDirectory
22
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
13
23
 
24
+ from .history import iso_to_snap
14
25
  from deriva.core.ermrest_model import Table
15
26
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
27
+ import deriva.core.utils.hash_utils as hash_utils
16
28
  from deriva.transfer.download.deriva_export import DerivaExport
17
29
  from deriva.transfer.download.deriva_download import (
18
30
  DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
22
34
  DerivaDownloadTimeoutError,
23
35
  )
24
36
 
37
+
25
38
  try:
26
39
  from icecream import ic
27
40
  except ImportError: # Graceful fallback if IceCream isn't installed.
28
41
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
29
42
 
30
- from graphlib import TopologicalSorter
31
- import json
32
- import logging
33
- from pathlib import Path
34
- from pydantic import (
35
- validate_call,
36
- ConfigDict,
37
- )
38
- import requests
39
-
40
- from tempfile import TemporaryDirectory, NamedTemporaryFile
41
- from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
42
-
43
43
  from deriva_ml import DatasetBag
44
44
  from .deriva_definitions import (
45
45
  ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
49
49
  RID,
50
50
  DRY_RUN_RID,
51
51
  )
52
- from .history import iso_to_snap
53
52
  from .deriva_model import DerivaModel
54
53
  from .database_model import DatabaseModel
55
54
  from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:
74
73
 
75
74
  _Logger = logging.getLogger("deriva_ml")
76
75
 
77
- def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
76
+ def __init__(
77
+ self,
78
+ model: DerivaModel,
79
+ cache_dir: Path,
80
+ working_dir: Path,
81
+ use_minid: bool = True,
82
+ ):
78
83
  self._model = model
79
84
  self._ml_schema = ML_SCHEMA
80
85
  self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
81
86
  self._cache_dir = cache_dir
82
87
  self._working_dir = working_dir
83
88
  self._logger = logging.getLogger("deriva_ml")
89
+ self._use_minid = use_minid
84
90
 
85
91
  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
86
92
  try:
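The constructor gains a `use_minid` flag (default `True`) that controls whether exported bags are registered as MINIDs and pushed to cloud storage. A minimal sketch of direct construction, assuming an existing `DerivaModel` instance named `model`; DerivaML normally builds this object internally:

```python
from pathlib import Path
from deriva_ml.dataset import Dataset  # module shown in this diff

# Hypothetical direct use; cache/working paths are placeholders.
ds = Dataset(
    model=model,                          # assumed DerivaModel instance
    cache_dir=Path.home() / "deriva-cache",
    working_dir=Path.home() / "deriva-work",
    use_minid=False,                      # skip MINID registration and S3 upload
)
```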
@@ -100,27 +106,28 @@ class Dataset:
100
106
  dataset_list: list[DatasetSpec],
101
107
  description: Optional[str] = "",
102
108
  execution_rid: Optional[RID] = None,
103
- ) -> list[dict[str, Any]]:
109
+ ) -> None:
104
110
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
105
-
111
+ # determine snapshot after changes were made
112
+ snap = self._model.catalog.get("/").json()["snaptime"]
106
113
  # Construct version records for insert
107
- version_records = [
108
- {
109
- "Dataset": dataset.rid,
110
- "Version": str(dataset.version),
111
- "Description": description,
112
- "Execution": execution_rid,
113
- }
114
- for dataset in dataset_list
115
- ]
114
+ version_records = schema_path.tables["Dataset_Version"].insert(
115
+ [
116
+ {
117
+ "Dataset": dataset.rid,
118
+ "Version": str(dataset.version),
119
+ "Description": description,
120
+ "Execution": execution_rid,
121
+ "Snapshot": snap,
122
+ }
123
+ for dataset in dataset_list
124
+ ]
125
+ )
116
126
 
117
- # Insert version records and construct entities for updating the dataset version column.
118
- version_rids = [
119
- {"Version": v["RID"], "RID": v["Dataset"]}
120
- for v in schema_path.tables["Dataset_Version"].insert(version_records)
121
- ]
122
- schema_path.tables["Dataset"].update(version_rids)
123
- return version_rids
127
+ # And update the dataset records.
128
+ schema_path.tables["Dataset"].update(
129
+ [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
130
+ )
124
131
 
125
132
  def _bootstrap_versions(self):
126
133
  datasets = [ds["RID"] for ds in self.find_datasets()]
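Dataset_Version rows now carry a Snapshot column captured immediately after the insert. A small sketch of how that snapshot id is read from ERMrest, using the same call as above; `catalog` stands for the bound ErmrestCatalog:

```python
# The catalog root reports the current snapshot id ("snaptime"), which the
# version records use to pin each dataset version to a point in time.
snap = catalog.get("/").json()["snaptime"]
print(f"Dataset versions inserted at snapshot {snap}")
```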
@@ -161,6 +168,21 @@ class Dataset:
161
168
  ]
162
169
  )
163
170
 
171
+ def _set_version_snapshot(self):
172
+ dataset_version_path = (
173
+ self._model.catalog.getPathBuilder()
174
+ .schemas[self._ml_schema]
175
+ .tables["Dataset_Version"]
176
+ )
177
+ versions = dataset_version_path.entities().fetch()
178
+ dataset_version_path.update(
179
+ [
180
+ {"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])}
181
+ for h in versions
182
+ if not h["Snapshot"]
183
+ ]
184
+ )
185
+
164
186
  def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
165
187
  """Return a list of DatasetHistory objects representing the dataset
166
188
 
@@ -170,6 +192,9 @@ class Dataset:
170
192
  Returns:
171
193
  A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
172
194
  """
195
+
196
+ if not self._is_dataset_rid(dataset_rid):
197
+ raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
173
198
  version_path = (
174
199
  self._model.catalog.getPathBuilder()
175
200
  .schemas[self._ml_schema]
@@ -179,7 +204,7 @@ class Dataset:
179
204
  DatasetHistory(
180
205
  dataset_version=DatasetVersion.parse(v["Version"]),
181
206
  minid=v["Minid"],
182
- timestamp=v["RCT"],
207
+ snapshot=v["Snapshot"],
183
208
  dataset_rid=dataset_rid,
184
209
  version_rid=v["RID"],
185
210
  description=v["Description"],
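DatasetHistory entries now expose the catalog `snapshot` for each version instead of a raw timestamp. A usage sketch, assuming `ml` is a DerivaML instance and `"1-ABCD"` is a placeholder dataset RID:

```python
# Walk the recorded versions of a dataset and show the snapshot and MINID
# (if any) associated with each one.
for h in ml.dataset_history(dataset_rid="1-ABCD"):
    print(h.dataset_version, h.snapshot, h.minid, h.description)
```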
@@ -240,7 +265,7 @@ class Dataset:
240
265
 
241
266
  Args:
242
267
  dataset_rid: RID of the dataset whose version is to be incremented.
243
- component: Which version of the dataset_table to increment. Major, Minor or Patch
268
+ component: Which version of the dataset_table to increment. Major, Minor, or Patch
244
269
  description: Description of the version update of the dataset_table.
245
270
  execution_rid: Which execution is performing increment.
246
271
 
@@ -248,7 +273,7 @@ class Dataset:
248
273
  new semantic version of the dataset_table as a 3-tuple
249
274
 
250
275
  Raises:
251
- DerivaMLException: if provided RID is not to a dataset_table.
276
+ DerivaMLException: if the provided RID does not refer to a dataset_table.
252
277
  """
253
278
 
254
279
  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +293,7 @@ class Dataset:
268
293
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
269
294
  def create_dataset(
270
295
  self,
271
- type: str | list[str],
296
+ dataset_types: str | list[str],
272
297
  description: str,
273
298
  execution_rid: Optional[RID] = None,
274
299
  version: Optional[DatasetVersion] = None,
@@ -276,7 +301,7 @@ class Dataset:
276
301
  """Create a new dataset_table from the specified list of RIDs.
277
302
 
278
303
  Args:
279
- type: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
304
+ dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
280
305
  description: Description of the dataset_table.
281
306
  execution_rid: Execution under which the dataset_table will be created.
282
307
  version: Version of the dataset_table.
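The first parameter is renamed from `type` to `dataset_types`, avoiding the shadowed builtin. A hedged example of the new call form, assuming `ml` is a DerivaML instance and the listed terms already exist in the DatasetType vocabulary:

```python
# Create a dataset tagged with one or more DatasetType terms (names are
# illustrative only).
dataset_rid = ml.create_dataset(
    dataset_types=["Training", "Image"],
    description="Training images for the v2 classifier",
)
```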
@@ -304,7 +329,7 @@ class Dataset:
304
329
  return False
305
330
 
306
331
  # Create the entry for the new dataset_table and get its RID.
307
- ds_types = [type] if isinstance(type, str) else type
332
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
308
333
  pb = self._model.catalog.getPathBuilder()
309
334
  for ds_type in ds_types:
310
335
  if not check_dataset_type(ds_type):
@@ -452,7 +477,9 @@ class Dataset:
452
477
  )
453
478
 
454
479
  # self.model = self.catalog.getCatalogModel()
455
- self.dataset_table.annotations.update(self._generate_dataset_annotations())
480
+ self.dataset_table.annotations.update(
481
+ self._generate_dataset_download_annotations()
482
+ )
456
483
  self._model.model.apply()
457
484
  return table
458
485
 
@@ -464,7 +491,7 @@ class Dataset:
464
491
 
465
492
  Args:
466
493
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
467
- recurse: (Default value = False)
494
+ recurse: (Default value = False)
468
495
  limit: If provided, the maximum number of members to return for each element type.
469
496
 
470
497
  Returns:
@@ -530,8 +557,8 @@ class Dataset:
530
557
  dataset is incremented and the description, if provided, is applied to that new version.
531
558
 
532
559
  Args:
533
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
534
- members: List of RIDs of members to add to the dataset_table.
560
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
561
+ members: List of member RIDs to add to the dataset_table.
535
562
  validate: Check rid_list to make sure elements are not already in the dataset_table.
536
563
  description: Markdown description of the updated dataset.
537
564
  execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +571,7 @@ class Dataset:
544
571
 
545
572
  Args:
546
573
  member_rid:
547
- path: (Default value = None)
574
+ path: (Default value = None)
548
575
 
549
576
  Returns:
550
577
 
@@ -570,7 +597,7 @@ class Dataset:
570
597
  a.other_fkeys.pop().pk_table.name: a.table.name
571
598
  for a in self.dataset_table.find_associations()
572
599
  }
573
- # Get a list of all the types of objects that can be linked to a dataset_table.
600
+ # Get a list of all the object types that can be linked to a dataset_table.
574
601
  for m in members:
575
602
  try:
576
603
  rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +645,8 @@ class Dataset:
618
645
  dataset is incremented and the description, if provided, is applied to that new version.
619
646
 
620
647
  Args:
621
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
622
- members: List of RIDs of members to add to the dataset_table.
648
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
649
+ members: List of member RIDs to add to the dataset_table.
623
650
  description: Markdown description of the updated dataset.
624
651
  execution_rid: Optional RID of execution associated with this operation.
625
652
  """
@@ -634,7 +661,7 @@ class Dataset:
634
661
  a.other_fkeys.pop().pk_table.name: a.table.name
635
662
  for a in self.dataset_table.find_associations()
636
663
  }
637
- # Get a list of all the types of objects that can be linked to a dataset_table.
664
+ # Get a list of all the object types that can be linked to a dataset_table.
638
665
  for m in members:
639
666
  try:
640
667
  rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +697,7 @@ class Dataset:
670
697
  )
671
698
 
672
699
  @validate_call
673
- def list_dataset_parents(self, dataset_rid: RID) -> list[RID]:
700
+ def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
674
701
  """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
675
702
  nested dataset.
676
703
 
@@ -696,14 +723,14 @@ class Dataset:
696
723
 
697
724
  @validate_call
698
725
  def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
699
- """Given a dataset_table RID, return a list of RIDs of any nested datasets.
726
+ """Given a dataset_table RID, return a list of RIDs for any nested datasets.
700
727
 
701
728
  Args:
702
729
  dataset_rid: A dataset_table RID.
703
- recurse: If True, return a list of RIDs of any nested datasets.
730
+ recurse: If True, return a list of nested dataset RIDs.
704
731
 
705
732
  Returns:
706
- list of RIDs of nested datasets.
733
+ list of nested dataset RIDs.
707
734
 
708
735
  """
709
736
  dataset_dataset_path = (
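A short sketch of navigating nested datasets with the two listing methods shown above; `ml` is assumed to be a DerivaML instance and `"1-ABCD"` a placeholder RID:

```python
# Immediate children, the full transitive closure, and the parents of a dataset.
children = ml.list_dataset_children(dataset_rid="1-ABCD")
descendants = ml.list_dataset_children(dataset_rid="1-ABCD", recurse=True)
parents = ml.list_dataset_parents(dataset_rid="1-ABCD")
```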
@@ -726,7 +753,7 @@ class Dataset:
726
753
 
727
754
  return find_children(dataset_rid)
728
755
 
729
- def _vocabulary_specification(
756
+ def _export_vocabulary(
730
757
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
731
758
  ) -> list[dict[str, Any]]:
732
759
  """
@@ -756,10 +783,10 @@ class Dataset:
756
783
  ) -> Iterator[tuple[str, str, Table]]:
757
784
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
758
785
 
759
- def source_path(path: tuple[Table, ...]):
786
+ def source_path(path: tuple[Table, ...]) -> list[str]:
760
787
  """Convert a tuple representing a path into a source path component with FK linkage"""
761
788
  path = list(path)
762
- p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
789
+ p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
763
790
  for table in path[1:]:
764
791
  if table.name == "Dataset_Dataset":
765
792
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +830,7 @@ class Dataset:
803
830
  dataset_elements = [
804
831
  snapshot_catalog._model.name_to_table(e)
805
832
  for e, m in snapshot_catalog.list_dataset_members(
806
- dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
833
+ dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
807
834
  ).items()
808
835
  if m
809
836
  ]
@@ -857,7 +884,7 @@ class Dataset:
857
884
  """
858
885
 
859
886
  def children_depth(
860
- dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
887
+ dataset_rid: RID, nested_datasets: dict[str, list[str]]
861
888
  ) -> int:
862
889
  """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
863
890
  try:
@@ -899,13 +926,13 @@ class Dataset:
899
926
  def _dataset_specification(
900
927
  self,
901
928
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
902
- dataset: DatasetSpec,
929
+ dataset: Optional[DatasetSpec] = None,
903
930
  snapshot_catalog: Optional[DerivaML] = None,
904
931
  ) -> list[dict[str, Any]]:
905
932
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
906
- The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
933
+ The top level data directory of the resulting BDBag will have one subdirectory for each element type. The subdirectory
907
934
  will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
908
- subdirectories for each object that is reachable from the dataset_table members.
935
+ subdirectory for each object that is reachable from the dataset_table members.
909
936
 
910
937
  To simplify reconstructing the relationship between tables, the CSV for each
911
938
  The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +940,7 @@ class Dataset:
913
940
 
914
941
  For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
915
942
  objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
916
- which has two asset in it. The layout of the resulting bdbag would be:
943
+ which has two assets in it. The layout of the resulting bdbag would be:
917
944
  data
918
945
  CV1/
919
946
  cv1.csv
@@ -939,12 +966,12 @@ class Dataset:
939
966
  Returns:
940
967
  A dataset_table specification.
941
968
  """
942
- element_spec = []
969
+ element_spec = self._export_vocabulary(writer)
943
970
  for path in self._table_paths(
944
971
  dataset=dataset, snapshot_catalog=snapshot_catalog
945
972
  ):
946
973
  element_spec.extend(writer(*path))
947
- return self._vocabulary_specification(writer) + element_spec
974
+ return element_spec
948
975
 
949
976
  def _download_dataset_bag(
950
977
  self,
@@ -985,7 +1012,7 @@ class Dataset:
985
1012
  for h in self.dataset_history(dataset_rid=dataset.rid)
986
1013
  if h.dataset_version == dataset.version
987
1014
  ][0]
988
- return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
1015
+ return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"
989
1016
 
990
1017
  def _create_dataset_minid(
991
1018
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -1000,7 +1027,7 @@ class Dataset:
1000
1027
  )
1001
1028
  try:
1002
1029
  self._logger.info(
1003
- f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
1030
+ f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
1004
1031
  )
1005
1032
  # Generate the bag and put into S3 storage.
1006
1033
  exporter = DerivaExport(
@@ -1009,9 +1036,10 @@ class Dataset:
1009
1036
  output_dir=tmp_dir,
1010
1037
  defer_download=True,
1011
1038
  timeout=(10, 610),
1012
- envars={"Dataset_RID": dataset.rid},
1039
+ envars={"RID": dataset.rid},
1013
1040
  )
1014
1041
  minid_page_url = exporter.export()[0] # Get the MINID launch page
1042
+
1015
1043
  except (
1016
1044
  DerivaDownloadError,
1017
1045
  DerivaDownloadConfigurationError,
@@ -1021,17 +1049,18 @@ class Dataset:
1021
1049
  ) as e:
1022
1050
  raise DerivaMLException(format_exception(e))
1023
1051
  # Update version table with MINID.
1024
- version_path = (
1025
- self._model.catalog.getPathBuilder()
1026
- .schemas[self._ml_schema]
1027
- .tables["Dataset_Version"]
1028
- )
1029
- version_rid = [
1030
- h
1031
- for h in self.dataset_history(dataset_rid=dataset.rid)
1032
- if h.dataset_version == dataset.version
1033
- ][0].version_rid
1034
- version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1052
+ if self._use_minid:
1053
+ version_path = (
1054
+ self._model.catalog.getPathBuilder()
1055
+ .schemas[self._ml_schema]
1056
+ .tables["Dataset_Version"]
1057
+ )
1058
+ version_rid = [
1059
+ h
1060
+ for h in self.dataset_history(dataset_rid=dataset.rid)
1061
+ if h.dataset_version == dataset.version
1062
+ ][0].version_rid
1063
+ version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
1035
1064
  return minid_page_url
1036
1065
 
1037
1066
  def _get_dataset_minid(
@@ -1074,14 +1103,25 @@ class Dataset:
1074
1103
  raise DerivaMLException(
1075
1104
  f"Minid for dataset {dataset.rid} doesn't exist"
1076
1105
  )
1077
- self._logger.info("Creating new MINID for dataset %s", dataset.rid)
1106
+ if self._use_minid:
1107
+ self._logger.info("Creating new MINID for dataset %s", dataset.rid)
1078
1108
  minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
1079
1109
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
1080
- r = requests.get(minid_url, headers={"accept": "application/json"})
1081
- return DatasetMinid(dataset_version=dataset.version, **r.json())
1110
+ if self._use_minid:
1111
+ r = requests.get(minid_url, headers={"accept": "application/json"})
1112
+ dataset_minid = DatasetMinid(
1113
+ dataset_version=dataset.version, **r.json()
1114
+ )
1115
+ else:
1116
+ dataset_minid = DatasetMinid(
1117
+ dataset_version=dataset.version,
1118
+ RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
1119
+ location=minid_url,
1120
+ )
1121
+ return dataset_minid
1082
1122
 
1083
1123
  def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
1084
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
1124
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
1085
1125
  that all the metadata is correct
1086
1126
 
1087
1127
  Args:
@@ -1090,19 +1130,37 @@ class Dataset:
1090
1130
  the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
1091
1131
  """
1092
1132
 
1093
- # Check to see if we have an existing idempotent materialization of the desired bag. If so, then just reuse
1133
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
1094
1134
  # it. If not, then we need to extract the contents of the archive into our cache directory.
1095
1135
  bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
1096
1136
  if bag_dir.exists():
1097
- bag_path = (bag_dir / f"Dataset_{minid.dataset_rid}").as_posix()
1098
- else:
1099
- bag_dir.mkdir(parents=True, exist_ok=True)
1100
- with NamedTemporaryFile(
1101
- delete=False, suffix=f"Dataset_{minid.dataset_rid}.zip"
1102
- ) as zip_file:
1103
- archive_path = fetch_single_file(minid.bag_url, zip_file.name)
1104
- bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1105
- bdb.validate_bag_structure(bag_path)
1137
+ self._logger.info(
1138
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1139
+ )
1140
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1141
+
1142
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
1143
+ with TemporaryDirectory() as tmp_dir:
1144
+ if self._use_minid:
1145
+ # Get bag from S3
1146
+ archive_path = fetch_single_file(minid.bag_url)
1147
+ else:
1148
+ exporter = DerivaExport(
1149
+ host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
1150
+ )
1151
+ archive_path = exporter.retrieve_file(minid.bag_url)
1152
+ hashes = hash_utils.compute_file_hashes(
1153
+ archive_path, hashes=["md5", "sha256"]
1154
+ )
1155
+ checksum = hashes["sha256"][0]
1156
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
1157
+ if bag_dir.exists():
1158
+ self._logger.info(
1159
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
1160
+ )
1161
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
1162
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
1163
+ bdb.validate_bag_structure(bag_path)
1106
1164
  return Path(bag_path)
1107
1165
 
1108
1166
  def _materialize_dataset_bag(
@@ -1154,6 +1212,9 @@ class Dataset:
1154
1212
 
1155
1213
  # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
1156
1214
  if not validated_check.exists():
1215
+ self._logger.info(
1216
+ f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
1217
+ )
1157
1218
  bdb.materialize(
1158
1219
  bag_path.as_posix(),
1159
1220
  fetch_callback=fetch_progress_callback,
@@ -1162,9 +1223,8 @@ class Dataset:
1162
1223
  validated_check.touch()
1163
1224
  return Path(bag_path)
1164
1225
 
1165
- def _export_outputs(
1226
+ def _export_annotation(
1166
1227
  self,
1167
- dataset: Optional[DatasetSpec] = None,
1168
1228
  snapshot_catalog: Optional[DerivaML] = None,
1169
1229
  ) -> list[dict[str, Any]]:
1170
1230
  """Return and output specification for the datasets in the provided model
@@ -1173,19 +1233,6 @@ class Dataset:
1173
1233
  An export specification suitable for Chaise.
1174
1234
  """
1175
1235
 
1176
- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1177
- """
1178
-
1179
- Args:
1180
- spath: list[Table]:
1181
- dpath: list[Table]:
1182
- table: Table
1183
-
1184
- Returns:
1185
- An export specification suitable for Chaise.
1186
- """
1187
- return self._export_dataset_element(spath, dpath, table)
1188
-
1189
1236
  # Export specification is a specification for the datasets, plus any controlled vocabulary
1190
1237
  return [
1191
1238
  {
@@ -1204,41 +1251,34 @@ class Dataset:
1204
1251
  "destination": {"type": "json", "name": "schema"},
1205
1252
  },
1206
1253
  ] + self._dataset_specification(
1207
- writer, dataset, snapshot_catalog=snapshot_catalog
1254
+ self._export_annotation_dataset_element,
1255
+ None,
1256
+ snapshot_catalog=snapshot_catalog,
1208
1257
  )
1209
1258
 
1210
- def _processor_params(
1259
+ def _export_specification(
1211
1260
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1212
1261
  ) -> list[dict[str, Any]]:
1213
1262
  """
1263
+ Generate a specification for the export engine for a specific dataset.
1264
+
1214
1265
  Returns:
1215
1266
  a download specification for the datasets in the provided model.
1216
1267
 
1217
1268
  """
1218
1269
 
1219
- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
1220
- """
1221
-
1222
- Args:
1223
- spath:
1224
- dpath:
1225
- table: Table
1226
-
1227
- Returns:
1228
-
1229
- """
1230
- return self._download_dataset_element(spath, dpath, table)
1231
-
1232
1270
  # Download spec is the spec for any controlled vocabulary and for the dataset_table.
1233
1271
  return [
1234
1272
  {
1235
1273
  "processor": "json",
1236
1274
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
1237
1275
  }
1238
- ] + self._dataset_specification(writer, dataset, snapshot_catalog)
1276
+ ] + self._dataset_specification(
1277
+ self._export_specification_dataset_element, dataset, snapshot_catalog
1278
+ )
1239
1279
 
1240
1280
  @staticmethod
1241
- def _download_dataset_element(
1281
+ def _export_specification_dataset_element(
1242
1282
  spath: str, dpath: str, table: Table
1243
1283
  ) -> list[dict[str, Any]]:
1244
1284
  """Return the download specification for the data object indicated by a path through the data model.
@@ -1255,7 +1295,7 @@ class Dataset:
1255
1295
  {
1256
1296
  "processor": "csv",
1257
1297
  "processor_params": {
1258
- "query_path": f"/entity/{spath}?limit=none",
1298
+ "query_path": f"/entity/{spath}",
1259
1299
  "output_path": dpath,
1260
1300
  },
1261
1301
  }
@@ -1268,16 +1308,15 @@ class Dataset:
1268
1308
  {
1269
1309
  "processor": "fetch",
1270
1310
  "processor_params": {
1271
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
1311
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
1272
1312
  "output_path": f"asset/{table.name}",
1273
1313
  },
1274
1314
  }
1275
1315
  )
1276
1316
  return exports
1277
1317
 
1278
- @staticmethod
1279
- def _export_dataset_element(
1280
- spath: str, dpath: str, table: Table
1318
+ def _export_annotation_dataset_element(
1319
+ self, spath: str, dpath: str, table: Table
1281
1320
  ) -> list[dict[str, Any]]:
1282
1321
  """Given a path in the data model, output an export specification for the path taken to get to the current table.
1283
1322
 
@@ -1293,9 +1332,23 @@ class Dataset:
1293
1332
  # into a path in the form of /S:T1/S:T2/S:Table
1294
1333
  # Generate the destination path in the file system using just the table names.
1295
1334
 
1335
+ skip_root_path = False
1336
+ if spath.startswith(f"{self._ml_schema}:Dataset/"):
1337
+ # Chaise will add table name and RID filter, so strip it off.
1338
+ spath = "/".join(spath.split("/")[2:])
1339
+ if spath == "":
1340
+ # This path is to just the dataset table.
1341
+ return []
1342
+ else:
1343
+ # A vocabulary table, so we don't want the root_path.
1344
+ skip_root_path = True
1296
1345
  exports = [
1297
1346
  {
1298
- "source": {"api": "entity", "path": spath},
1347
+ "source": {
1348
+ "api": "entity",
1349
+ "path": spath,
1350
+ "skip_root_path": skip_root_path,
1351
+ },
1299
1352
  "destination": {"name": dpath, "type": "csv"},
1300
1353
  }
1301
1354
  ]
@@ -1306,6 +1359,7 @@ class Dataset:
1306
1359
  exports.append(
1307
1360
  {
1308
1361
  "source": {
1362
+ "skip_root_path": False,
1309
1363
  "api": "attribute",
1310
1364
  "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
1311
1365
  },
@@ -1315,44 +1369,53 @@ class Dataset:
1315
1369
  return exports
1316
1370
 
1317
1371
  def _generate_dataset_download_spec(
1318
- self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
1372
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
1319
1373
  ) -> dict[str, Any]:
1320
1374
  """
1375
+ Generate a specification for downloading a specific dataset.
1321
1376
 
1377
+ This routine creates a download specification that can be used by the Deriva export processor to download
1378
+ a specific dataset as a MINID.
1322
1379
  Returns:
1323
1380
  """
1324
1381
  s3_target = "s3://eye-ai-shared"
1325
1382
  minid_test = False
1326
1383
 
1327
1384
  catalog_id = self._version_snapshot(dataset)
1328
- return {
1329
- "env": {"Dataset_RID": "{Dataset_RID}"},
1385
+ post_processors = (
1386
+ {
1387
+ "post_processors": [
1388
+ {
1389
+ "processor": "cloud_upload",
1390
+ "processor_params": {
1391
+ "acl": "public-read",
1392
+ "target_url": s3_target,
1393
+ },
1394
+ },
1395
+ {
1396
+ "processor": "identifier",
1397
+ "processor_params": {
1398
+ "test": minid_test,
1399
+ "env_column_map": {
1400
+ "RID": "{RID}@{snaptime}",
1401
+ "Description": "{Description}",
1402
+ },
1403
+ },
1404
+ },
1405
+ ]
1406
+ }
1407
+ if self._use_minid
1408
+ else {}
1409
+ )
1410
+ return post_processors | {
1411
+ "env": {"RID": "{RID}"},
1330
1412
  "bag": {
1331
- "bag_name": "Dataset_{Dataset_RID}",
1413
+ "bag_name": "Dataset_{RID}",
1332
1414
  "bag_algorithms": ["md5"],
1333
1415
  "bag_archiver": "zip",
1334
1416
  "bag_metadata": {},
1335
1417
  "bag_idempotent": True,
1336
1418
  },
1337
- "post_processors": [
1338
- {
1339
- "processor": "cloud_upload",
1340
- "processor_params": {
1341
- "acl": "public-read",
1342
- "target_url": s3_target,
1343
- },
1344
- },
1345
- {
1346
- "processor": "identifier",
1347
- "processor_params": {
1348
- "test": minid_test,
1349
- "env_column_map": {
1350
- "Dataset_RID": "{RID}@{snaptime}",
1351
- "Description": "{Description}",
1352
- },
1353
- },
1354
- },
1355
- ],
1356
1419
  "catalog": {
1357
1420
  "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
1358
1421
  "catalog_id": catalog_id,
@@ -1368,125 +1431,50 @@ class Dataset:
1368
1431
  {
1369
1432
  "processor": "env",
1370
1433
  "processor_params": {
1371
- "query_path": "/entity/M:=deriva-ml:Dataset/RID={Dataset_RID}?limit=none",
1434
+ "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
1372
1435
  "output_path": "Dataset",
1373
1436
  "query_keys": ["RID", "Description"],
1374
1437
  },
1375
1438
  },
1376
1439
  ]
1377
- + self._processor_params(dataset, snapshot_catalog),
1440
+ + self._export_specification(dataset, snapshot_catalog),
1378
1441
  },
1379
1442
  }
1380
1443
 
1381
- def dataset_visible_columns(self) -> dict[str, Any]:
1382
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1383
- rcb_name = next(
1384
- [fk.name[0].name, fk.name[1]]
1385
- for fk in dataset_table.foreign_keys
1386
- if fk.name[1] == "Dataset_RCB_fkey"
1387
- )
1388
- rmb_name = next(
1389
- [fk.name[0].name, fk.name[1]]
1390
- for fk in dataset_table.foreign_keys
1391
- if fk.name[1] == "Dataset_RMB_fkey"
1392
- )
1393
- return {
1394
- "*": [
1395
- "RID",
1396
- "Description",
1397
- {
1398
- "display": {
1399
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
1400
- },
1401
- "markdown_name": "Annotation App",
1402
- },
1403
- rcb_name,
1404
- rmb_name,
1405
- ],
1406
- "detailed": [
1407
- "RID",
1408
- "Description",
1409
- {
1410
- "source": [
1411
- {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
1412
- {
1413
- "outbound": [
1414
- "deriva-ml",
1415
- "Dataset_Dataset_Type_Dataset_Type_fkey",
1416
- ]
1444
+ def _generate_dataset_download_annotations(self) -> dict[str, Any]:
1445
+ post_processors = (
1446
+ {
1447
+ "type": "BAG",
1448
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
1449
+ "displayname": "BDBag to Cloud",
1450
+ "bag_idempotent": True,
1451
+ "postprocessors": [
1452
+ {
1453
+ "processor": "cloud_upload",
1454
+ "processor_params": {
1455
+ "acl": "public-read",
1456
+ "target_url": "s3://eye-ai-shared/",
1417
1457
  },
1418
- "RID",
1419
- ],
1420
- "markdown_name": "Dataset Types",
1421
- },
1422
- {
1423
- "display": {
1424
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
1425
1458
  },
1426
- "markdown_name": "Annotation App",
1427
- },
1428
- rcb_name,
1429
- rmb_name,
1430
- ],
1431
- "filter": {
1432
- "and": [
1433
- {"source": "RID"},
1434
- {"source": "Description"},
1435
1459
  {
1436
- "source": [
1437
- {
1438
- "inbound": [
1439
- "deriva-ml",
1440
- "Dataset_Dataset_Type_Dataset_fkey",
1441
- ]
1442
- },
1443
- {
1444
- "outbound": [
1445
- "deriva-ml",
1446
- "Dataset_Dataset_Type_Dataset_Type_fkey",
1447
- ]
1460
+ "processor": "identifier",
1461
+ "processor_params": {
1462
+ "test": False,
1463
+ "env_column_map": {
1464
+ "RID": "{RID}@{snaptime}",
1465
+ "Description": "{Description}",
1448
1466
  },
1449
- "RID",
1450
- ],
1451
- "markdown_name": "Dataset Types",
1452
- },
1453
- {
1454
- "source": [{"outbound": rcb_name}, "RID"],
1455
- "markdown_name": "Created By",
1456
- },
1457
- {
1458
- "source": [{"outbound": rmb_name}, "RID"],
1459
- "markdown_name": "Modified By",
1467
+ },
1460
1468
  },
1461
- ]
1462
- },
1463
- }
1464
-
1465
- def _dataset_visible_fkeys(self) -> dict[str, Any]:
1466
- def fkey_name(fk):
1467
- return [fk.name[0].name, fk.name[1]]
1468
-
1469
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1470
-
1471
- source_list = [
1472
- {
1473
- "source": [
1474
- {"inbound": fkey_name(fkey.self_fkey)},
1475
- {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
1476
- "RID",
1477
1469
  ],
1478
- "markdown_name": other_fkey.pk_table.name,
1479
1470
  }
1480
- for fkey in dataset_table.find_associations(max_arity=3, pure=False)
1481
- ]
1482
- return {"detailed": source_list}
1483
-
1484
- def _generate_dataset_annotations(self) -> dict[str, Any]:
1471
+ if self._use_minid
1472
+ else {}
1473
+ )
1485
1474
  return {
1486
1475
  deriva_tags.export_fragment_definitions: {
1487
- "dataset_export_outputs": self._export_outputs()
1476
+ "dataset_export_outputs": self._export_annotation()
1488
1477
  },
1489
- deriva_tags.visible_columns: self.dataset_visible_columns(),
1490
1478
  deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
1491
1479
  deriva_tags.export_2019: {
1492
1480
  "detailed": {
@@ -1496,45 +1484,56 @@ class Dataset:
1496
1484
  "outputs": [{"fragment_key": "dataset_export_outputs"}],
1497
1485
  "displayname": "BDBag Download",
1498
1486
  "bag_idempotent": True,
1499
- "postprocessors": [
1500
- {
1501
- "processor": "identifier",
1502
- "processor_params": {
1503
- "test": False,
1504
- "env_column_map": {
1505
- "Dataset_RID": "{RID}@{snaptime}",
1506
- "Description": "{Description}",
1507
- },
1508
- },
1509
- }
1510
- ],
1511
- },
1512
- {
1513
- "type": "BAG",
1514
- "outputs": [{"fragment_key": "dataset_export_outputs"}],
1515
- "displayname": "BDBag to Cloud",
1516
- "bag_idempotent": True,
1517
- "postprocessors": [
1518
- {
1519
- "processor": "cloud_upload",
1520
- "processor_params": {
1521
- "acl": "public-read",
1522
- "target_url": "s3://eye-ai-shared/",
1523
- },
1524
- },
1525
- {
1526
- "processor": "identifier",
1527
- "processor_params": {
1528
- "test": False,
1529
- "env_column_map": {
1530
- "Dataset_RID": "{RID}@{snaptime}",
1531
- "Description": "{Description}",
1532
- },
1533
- },
1534
- },
1535
- ],
1536
- },
1487
+ }
1488
+ | post_processors
1537
1489
  ]
1538
1490
  }
1539
1491
  },
1540
1492
  }
1493
+
1494
+ def _dataset_visible_fkeys(self) -> dict[str, Any]:
1495
+ def fkey_name(fk):
1496
+ return [fk.name[0].name, fk.name[1]]
1497
+
1498
+ dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
1499
+
1500
+ source_list = [
1501
+ {
1502
+ "source": [
1503
+ {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
1504
+ "RID",
1505
+ ],
1506
+ "markdown_name": "Previous Versions",
1507
+ "entity": True,
1508
+ },
1509
+ {
1510
+ "source": [
1511
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1512
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1513
+ "RID",
1514
+ ],
1515
+ "markdown_name": "Parent Datasets",
1516
+ },
1517
+ {
1518
+ "source": [
1519
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
1520
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
1521
+ "RID",
1522
+ ],
1523
+ "markdown_name": "Child Datasets",
1524
+ },
1525
+ ]
1526
+ source_list.extend(
1527
+ [
1528
+ {
1529
+ "source": [
1530
+ {"inbound": fkey_name(fkey.self_fkey)},
1531
+ {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
1532
+ "RID",
1533
+ ],
1534
+ "markdown_name": other_fkey.pk_table.name,
1535
+ }
1536
+ for fkey in dataset_table.find_associations(max_arity=3, pure=False)
1537
+ ]
1538
+ )
1539
+ return {"detailed": source_list}