deriva-ml 1.13.2__py3-none-any.whl → 1.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
deriva_ml/dataset.py CHANGED
@@ -1,18 +1,30 @@
  """
- This module defines the DataSet class with is used to manipulate datasets in DerivaML,
- The intended use of this class is as a base class in DerivaML so all the methods documented here are
+ This module defines the DataSet class with is used to manipulate datasets in DerivaML.
+ The intended use of this class is as a base class in DerivaML, so all the methods documented here are
  accessible via a DerivaML class instance.

-
  """

  from __future__ import annotations
- from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
+ from bdbag.fetch.fetcher import fetch_single_file
  from collections import defaultdict
+ from graphlib import TopologicalSorter
+ import json
+ import logging
+ from pathlib import Path
+ from pydantic import (
+ validate_call,
+ ConfigDict,
+ )
+ import requests
+ from tempfile import TemporaryDirectory
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
+

  from deriva.core.ermrest_model import Table
  from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
+ import deriva.core.utils.hash_utils as hash_utils
  from deriva.transfer.download.deriva_export import DerivaExport
  from deriva.transfer.download.deriva_download import (
  DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
  DerivaDownloadTimeoutError,
  )

+
  try:
  from icecream import ic
  except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa

- from graphlib import TopologicalSorter
- import json
- import logging
- from pathlib import Path
- from pydantic import (
- validate_call,
- ConfigDict,
- )
- import requests
-
- from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
-
  from deriva_ml import DatasetBag
  from .deriva_definitions import (
  ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
  RID,
  DRY_RUN_RID,
  )
- from .history import iso_to_snap
  from .deriva_model import DerivaModel
  from .database_model import DatabaseModel
  from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:

  _Logger = logging.getLogger("deriva_ml")

- def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
+ def __init__(
+ self,
+ model: DerivaModel,
+ cache_dir: Path,
+ working_dir: Path,
+ use_minid: bool = True,
+ ):
  self._model = model
  self._ml_schema = ML_SCHEMA
  self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
  self._cache_dir = cache_dir
  self._working_dir = working_dir
  self._logger = logging.getLogger("deriva_ml")
+ self._use_minid = use_minid

  def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
  try:
@@ -100,27 +106,28 @@ class Dataset:
  dataset_list: list[DatasetSpec],
  description: Optional[str] = "",
  execution_rid: Optional[RID] = None,
- ) -> list[dict[str, Any]]:
+ ) -> None:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-
+ # determine snapshot after changes were made
+ snap = self._model.catalog.get("/").json()["snaptime"]
  # Construct version records for insert
- version_records = [
- {
- "Dataset": dataset.rid,
- "Version": str(dataset.version),
- "Description": description,
- "Execution": execution_rid,
- }
- for dataset in dataset_list
- ]
+ version_records = schema_path.tables["Dataset_Version"].insert(
+ [
+ {
+ "Dataset": dataset.rid,
+ "Version": str(dataset.version),
+ "Description": description,
+ "Execution": execution_rid,
+ "Snapshot": snap,
+ }
+ for dataset in dataset_list
+ ]
+ )

- # Insert version records and construct entities for updating the dataset version column.
- version_rids = [
- {"Version": v["RID"], "RID": v["Dataset"]}
- for v in schema_path.tables["Dataset_Version"].insert(version_records)
- ]
- schema_path.tables["Dataset"].update(version_rids)
- return version_rids
+ # And update the dataset records.
+ schema_path.tables["Dataset"].update(
+ [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
+ )

  def _bootstrap_versions(self):
  datasets = [ds["RID"] for ds in self.find_datasets()]
@@ -170,6 +177,9 @@ class Dataset:
  Returns:
  A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
  """
+
+ if not self._is_dataset_rid(dataset_rid):
+ raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
  version_path = (
  self._model.catalog.getPathBuilder()
  .schemas[self._ml_schema]
@@ -179,7 +189,7 @@ class Dataset:
  DatasetHistory(
  dataset_version=DatasetVersion.parse(v["Version"]),
  minid=v["Minid"],
- timestamp=v["RCT"],
+ snapshot=v["Snapshot"],
  dataset_rid=dataset_rid,
  version_rid=v["RID"],
  description=v["Description"],
@@ -240,7 +250,7 @@ class Dataset:

  Args:
  dataset_rid: RID of the dataset whose version is to be incremented.
- component: Which version of the dataset_table to increment. Major, Minor or Patch
+ component: Which version of the dataset_table to increment. Major, Minor, or Patch
  description: Description of the version update of the dataset_table.
  execution_rid: Which execution is performing increment.

@@ -248,7 +258,7 @@ class Dataset:
  new semantic version of the dataset_table as a 3-tuple

  Raises:
- DerivaMLException: if provided RID is not to a dataset_table.
+ DerivaMLException: if provided, RID is not to a dataset_table.
  """

  # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +278,7 @@ class Dataset:
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
  self,
- type: str | list[str],
+ dataset_types: str | list[str],
  description: str,
  execution_rid: Optional[RID] = None,
  version: Optional[DatasetVersion] = None,
@@ -276,7 +286,7 @@ class Dataset:
  """Create a new dataset_table from the specified list of RIDs.

  Args:
- type: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
+ dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
  description: Description of the dataset_table.
  execution_rid: Execution under which the dataset_table will be created.
  version: Version of the dataset_table.
@@ -304,7 +314,7 @@ class Dataset:
  return False

  # Create the entry for the new dataset_table and get its RID.
- ds_types = [type] if isinstance(type, str) else type
+ ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
  pb = self._model.catalog.getPathBuilder()
  for ds_type in ds_types:
  if not check_dataset_type(ds_type):
@@ -452,7 +462,9 @@ class Dataset:
  )

  # self.model = self.catalog.getCatalogModel()
- self.dataset_table.annotations.update(self._generate_dataset_annotations())
+ self.dataset_table.annotations.update(
+ self._generate_dataset_download_annotations()
+ )
  self._model.model.apply()
  return table

@@ -464,7 +476,7 @@ class Dataset:

  Args:
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
- recurse: (Default value = False)
+ recurse: (Default value = False)
  limit: If provided, the maximum number of members to return for each element type.

  Returns:
@@ -530,8 +542,8 @@ class Dataset:
  dataset is incremented and the description, if provide is applied to that new version.

  Args:
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
- members: List of RIDs of members to add to the dataset_table.
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+ members: List of member RIDs to add to the dataset_table.
  validate: Check rid_list to make sure elements are not already in the dataset_table.
  description: Markdown description of the updated dataset.
  execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +556,7 @@ class Dataset:

  Args:
  member_rid:
- path: (Default value = None)
+ path: (Default value = None)

  Returns:

@@ -570,7 +582,7 @@ class Dataset:
  a.other_fkeys.pop().pk_table.name: a.table.name
  for a in self.dataset_table.find_associations()
  }
- # Get a list of all the types of objects that can be linked to a dataset_table.
+ # Get a list of all the object types that can be linked to a dataset_table.
  for m in members:
  try:
  rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +630,8 @@ class Dataset:
  dataset is incremented and the description, if provide is applied to that new version.

  Args:
- dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
- members: List of RIDs of members to add to the dataset_table.
+ dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+ members: List of member RIDs to add to the dataset_table.
  description: Markdown description of the updated dataset.
  execution_rid: Optional RID of execution associated with this operation.
  """
@@ -634,7 +646,7 @@ class Dataset:
  a.other_fkeys.pop().pk_table.name: a.table.name
  for a in self.dataset_table.find_associations()
  }
- # Get a list of all the types of objects that can be linked to a dataset_table.
+ # Get a list of all the object types that can be linked to a dataset_table.
  for m in members:
  try:
  rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +682,7 @@ class Dataset:
  )

  @validate_call
- def list_dataset_parents(self, dataset_rid: RID) -> list[RID]:
+ def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
  """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
  nested dataset.

@@ -696,14 +708,14 @@ class Dataset:

  @validate_call
  def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
- """Given a dataset_table RID, return a list of RIDs of any nested datasets.
+ """Given a dataset_table RID, return a list of RIDs for any nested datasets.

  Args:
  dataset_rid: A dataset_table RID.
- recurse: If True, return a list of RIDs of any nested datasets.
+ recurse: If True, return a list of nested datasets RIDs.

  Returns:
- list of RIDs of nested datasets.
+ list of nested dataset RIDs.

  """
  dataset_dataset_path = (
@@ -726,7 +738,7 @@ class Dataset:

  return find_children(dataset_rid)

- def _vocabulary_specification(
+ def _export_vocabulary(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
  ) -> list[dict[str, Any]]:
  """
@@ -756,10 +768,10 @@ class Dataset:
  ) -> Iterator[tuple[str, str, Table]]:
  paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

- def source_path(path: tuple[Table, ...]):
+ def source_path(path: tuple[Table, ...]) -> list[str]:
  """Convert a tuple representing a path into a source path component with FK linkage"""
  path = list(path)
- p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
+ p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
  for table in path[1:]:
  if table.name == "Dataset_Dataset":
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +815,7 @@ class Dataset:
  dataset_elements = [
  snapshot_catalog._model.name_to_table(e)
  for e, m in snapshot_catalog.list_dataset_members(
- dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
+ dataset_rid=dataset_rid, # limit=1 Limit seems to make things run slow.
  ).items()
  if m
  ]
@@ -857,7 +869,7 @@ class Dataset:
  """

  def children_depth(
- dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
+ dataset_rid: RID, nested_datasets: dict[str, list[str]]
  ) -> int:
  """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
  try:
@@ -899,13 +911,13 @@ class Dataset:
  def _dataset_specification(
  self,
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
- dataset: DatasetSpec,
+ dataset: Optional[DatasetSpec] = None,
  snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
- The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
+ The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
  will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
- subdirectories for each object that is reachable from the dataset_table members.
+ subdirectory for each object that is reachable from the dataset_table members.

  To simplify reconstructing the relationship between tables, the CVS for each
  The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +925,7 @@ class Dataset:

  For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
  objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
- which has two asset in it. The layout of the resulting bdbag would be:
+ which has two assets in it. The layout of the resulting bdbag would be:
  data
  CV1/
  cv1.csv
@@ -939,12 +951,12 @@ class Dataset:
  Returns:
  A dataset_table specification.
  """
- element_spec = []
+ element_spec = self._export_vocabulary(writer)
  for path in self._table_paths(
  dataset=dataset, snapshot_catalog=snapshot_catalog
  ):
  element_spec.extend(writer(*path))
- return self._vocabulary_specification(writer) + element_spec
+ return element_spec

  def _download_dataset_bag(
  self,
@@ -985,7 +997,7 @@ class Dataset:
  for h in self.dataset_history(dataset_rid=dataset.rid)
  if h.dataset_version == dataset.version
  ][0]
- return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
+ return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

  def _create_dataset_minid(
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -1000,7 +1012,7 @@ class Dataset:
  )
  try:
  self._logger.info(
- f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
+ f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
  )
  # Generate the bag and put into S3 storage.
  exporter = DerivaExport(
@@ -1009,9 +1021,10 @@ class Dataset:
  output_dir=tmp_dir,
  defer_download=True,
  timeout=(10, 610),
- envars={"Dataset_RID": dataset.rid},
+ envars={"RID": dataset.rid},
  )
  minid_page_url = exporter.export()[0] # Get the MINID launch page
+
  except (
  DerivaDownloadError,
  DerivaDownloadConfigurationError,
@@ -1021,17 +1034,18 @@ class Dataset:
  ) as e:
  raise DerivaMLException(format_exception(e))
  # Update version table with MINID.
- version_path = (
- self._model.catalog.getPathBuilder()
- .schemas[self._ml_schema]
- .tables["Dataset_Version"]
- )
- version_rid = [
- h
- for h in self.dataset_history(dataset_rid=dataset.rid)
- if h.dataset_version == dataset.version
- ][0].version_rid
- version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
+ if self._use_minid:
+ version_path = (
+ self._model.catalog.getPathBuilder()
+ .schemas[self._ml_schema]
+ .tables["Dataset_Version"]
+ )
+ version_rid = [
+ h
+ for h in self.dataset_history(dataset_rid=dataset.rid)
+ if h.dataset_version == dataset.version
+ ][0].version_rid
+ version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
  return minid_page_url

  def _get_dataset_minid(
@@ -1074,14 +1088,25 @@ class Dataset:
  raise DerivaMLException(
  f"Minid for dataset {dataset.rid} doesn't exist"
  )
- self._logger.info("Creating new MINID for dataset %s", dataset.rid)
+ if self._use_minid:
+ self._logger.info("Creating new MINID for dataset %s", dataset.rid)
  minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
- r = requests.get(minid_url, headers={"accept": "application/json"})
- return DatasetMinid(dataset_version=dataset.version, **r.json())
+ if self._use_minid:
+ r = requests.get(minid_url, headers={"accept": "application/json"})
+ dataset_minid = DatasetMinid(
+ dataset_version=dataset.version, **r.json()
+ )
+ else:
+ dataset_minid = DatasetMinid(
+ dataset_version=dataset.version,
+ RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
+ location=minid_url,
+ )
+ return dataset_minid

  def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
+ """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
  that all the metadata is correct

  Args:
@@ -1090,19 +1115,37 @@ class Dataset:
  the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
  """

- # Check to see if we have an existing idempotent materialization of the desired bag. If so, then just reuse
+ # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
  # it. If not, then we need to extract the contents of the archive into our cache directory.
  bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
  if bag_dir.exists():
- bag_path = (bag_dir / f"Dataset_{minid.dataset_rid}").as_posix()
- else:
- bag_dir.mkdir(parents=True, exist_ok=True)
- with NamedTemporaryFile(
- delete=False, suffix=f"Dataset_{minid.dataset_rid}.zip"
- ) as zip_file:
- archive_path = fetch_single_file(minid.bag_url, zip_file.name)
- bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
- bdb.validate_bag_structure(bag_path)
+ self._logger.info(
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+ )
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+
+ # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
+ with TemporaryDirectory() as tmp_dir:
+ if self._use_minid:
+ # Get bag from S3
+ archive_path = fetch_single_file(minid.bag_url)
+ else:
+ exporter = DerivaExport(
+ host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
+ )
+ archive_path = exporter.retrieve_file(minid.bag_url)
+ hashes = hash_utils.compute_file_hashes(
+ archive_path, hashes=["md5", "sha256"]
+ )
+ checksum = hashes["sha256"][0]
+ bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
+ if bag_dir.exists():
+ self._logger.info(
+ f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+ )
+ return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+ bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
+ bdb.validate_bag_structure(bag_path)
  return Path(bag_path)

  def _materialize_dataset_bag(
@@ -1154,6 +1197,9 @@ class Dataset:

  # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
  if not validated_check.exists():
+ self._logger.info(
+ f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
+ )
  bdb.materialize(
  bag_path.as_posix(),
  fetch_callback=fetch_progress_callback,
@@ -1162,9 +1208,8 @@ class Dataset:
  validated_check.touch()
  return Path(bag_path)

- def _export_outputs(
+ def _export_annotation(
  self,
- dataset: Optional[DatasetSpec] = None,
  snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Return and output specification for the datasets in the provided model
@@ -1173,19 +1218,6 @@ class Dataset:
  An export specification suitable for Chaise.
  """

- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
- """
-
- Args:
- spath: list[Table]:
- dpath: list[Table]:
- table: Table
-
- Returns:
- An export specification suitable for Chaise.
- """
- return self._export_dataset_element(spath, dpath, table)
-
  # Export specification is a specification for the datasets, plus any controlled vocabulary
  return [
  {
@@ -1204,41 +1236,34 @@ class Dataset:
  "destination": {"type": "json", "name": "schema"},
  },
  ] + self._dataset_specification(
- writer, dataset, snapshot_catalog=snapshot_catalog
+ self._export_annotation_dataset_element,
+ None,
+ snapshot_catalog=snapshot_catalog,
  )

- def _processor_params(
+ def _export_specification(
  self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
  ) -> list[dict[str, Any]]:
  """
+ Generate a specification for export engine for specific dataset.
+
  Returns:
  a download specification for the datasets in the provided model.

  """

- def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
- """
-
- Args:
- spath:
- dpath:
- table: Table
-
- Returns:
-
- """
- return self._download_dataset_element(spath, dpath, table)
-
  # Download spec is the spec for any controlled vocabulary and for the dataset_table.
  return [
  {
  "processor": "json",
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
- ] + self._dataset_specification(writer, dataset, snapshot_catalog)
+ ] + self._dataset_specification(
+ self._export_specification_dataset_element, dataset, snapshot_catalog
+ )

  @staticmethod
- def _download_dataset_element(
+ def _export_specification_dataset_element(
  spath: str, dpath: str, table: Table
  ) -> list[dict[str, Any]]:
  """Return the download specification for the data object indicated by a path through the data model.
@@ -1255,7 +1280,7 @@ class Dataset:
  {
  "processor": "csv",
  "processor_params": {
- "query_path": f"/entity/{spath}?limit=none",
+ "query_path": f"/entity/{spath}",
  "output_path": dpath,
  },
  }
@@ -1268,16 +1293,15 @@ class Dataset:
  {
  "processor": "fetch",
  "processor_params": {
- "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5?limit=none",
+ "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
  "output_path": f"asset/{table.name}",
  },
  }
  )
  return exports

- @staticmethod
- def _export_dataset_element(
- spath: str, dpath: str, table: Table
+ def _export_annotation_dataset_element(
+ self, spath: str, dpath: str, table: Table
  ) -> list[dict[str, Any]]:
  """Given a path in the data model, output an export specification for the path taken to get to the current table.

@@ -1293,9 +1317,23 @@ class Dataset:
  # into a path in the form of /S:T1/S:T2/S:Table
  # Generate the destination path in the file system using just the table names.

+ skip_root_path = False
+ if spath.startswith(f"{self._ml_schema}:Dataset/"):
+ # Chaise will add table name and RID filter, so strip it off.
+ spath = "/".join(spath.split("/")[2:])
+ if spath == "":
+ # This path is to just the dataset table.
+ return []
+ else:
+ # A vocabulary table, so we don't want the root_path.
+ skip_root_path = True
  exports = [
  {
- "source": {"api": "entity", "path": spath},
+ "source": {
+ "api": "entity",
+ "path": spath,
+ "skip_root_path": skip_root_path,
+ },
  "destination": {"name": dpath, "type": "csv"},
  }
  ]
@@ -1306,6 +1344,7 @@ class Dataset:
  exports.append(
  {
  "source": {
+ "skip_root_path": False,
  "api": "attribute",
  "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
  },
@@ -1315,44 +1354,53 @@ class Dataset:
  return exports

  def _generate_dataset_download_spec(
- self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
  ) -> dict[str, Any]:
  """
+ Generate a specification for downloading a specific dataset.

+ This routine creates a download specification that can be used by the Deriva export processor to download
+ a specific dataset as a MINID.
  Returns:
  """
  s3_target = "s3://eye-ai-shared"
  minid_test = False

  catalog_id = self._version_snapshot(dataset)
- return {
- "env": {"Dataset_RID": "{Dataset_RID}"},
+ post_processors = (
+ {
+ "post_processors": [
+ {
+ "processor": "cloud_upload",
+ "processor_params": {
+ "acl": "public-read",
+ "target_url": s3_target,
+ },
+ },
+ {
+ "processor": "identifier",
+ "processor_params": {
+ "test": minid_test,
+ "env_column_map": {
+ "RID": "{RID}@{snaptime}",
+ "Description": "{Description}",
+ },
+ },
+ },
+ ]
+ }
+ if self._use_minid
+ else {}
+ )
+ return post_processors | {
+ "env": {"RID": "{RID}"},
  "bag": {
- "bag_name": "Dataset_{Dataset_RID}",
+ "bag_name": "Dataset_{RID}",
  "bag_algorithms": ["md5"],
  "bag_archiver": "zip",
  "bag_metadata": {},
  "bag_idempotent": True,
  },
- "post_processors": [
- {
- "processor": "cloud_upload",
- "processor_params": {
- "acl": "public-read",
- "target_url": s3_target,
- },
- },
- {
- "processor": "identifier",
- "processor_params": {
- "test": minid_test,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- },
- ],
  "catalog": {
  "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
  "catalog_id": catalog_id,
@@ -1368,125 +1416,50 @@ class Dataset:
  {
  "processor": "env",
  "processor_params": {
- "query_path": "/entity/M:=deriva-ml:Dataset/RID={Dataset_RID}?limit=none",
+ "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
  "output_path": "Dataset",
  "query_keys": ["RID", "Description"],
  },
  },
  ]
- + self._processor_params(dataset, snapshot_catalog),
+ + self._export_specification(dataset, snapshot_catalog),
  },
  }

- def dataset_visible_columns(self) -> dict[str, Any]:
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
- rcb_name = next(
- [fk.name[0].name, fk.name[1]]
- for fk in dataset_table.foreign_keys
- if fk.name[1] == "Dataset_RCB_fkey"
- )
- rmb_name = next(
- [fk.name[0].name, fk.name[1]]
- for fk in dataset_table.foreign_keys
- if fk.name[1] == "Dataset_RMB_fkey"
- )
- return {
- "*": [
- "RID",
- "Description",
- {
- "display": {
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
- },
- "markdown_name": "Annotation App",
- },
- rcb_name,
- rmb_name,
- ],
- "detailed": [
- "RID",
- "Description",
- {
- "source": [
- {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
- {
- "outbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_Type_fkey",
- ]
+ def _generate_dataset_download_annotations(self) -> dict[str, Any]:
+ post_processors = (
+ {
+ "type": "BAG",
+ "outputs": [{"fragment_key": "dataset_export_outputs"}],
+ "displayname": "BDBag to Cloud",
+ "bag_idempotent": True,
+ "postprocessors": [
+ {
+ "processor": "cloud_upload",
+ "processor_params": {
+ "acl": "public-read",
+ "target_url": "s3://eye-ai-shared/",
  },
- "RID",
- ],
- "markdown_name": "Dataset Types",
- },
- {
- "display": {
- "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
  },
- "markdown_name": "Annotation App",
- },
- rcb_name,
- rmb_name,
- ],
- "filter": {
- "and": [
- {"source": "RID"},
- {"source": "Description"},
  {
- "source": [
- {
- "inbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_fkey",
- ]
- },
- {
- "outbound": [
- "deriva-ml",
- "Dataset_Dataset_Type_Dataset_Type_fkey",
- ]
+ "processor": "identifier",
+ "processor_params": {
+ "test": False,
+ "env_column_map": {
+ "RID": "{RID}@{snaptime}",
+ "Description": "{Description}",
  },
- "RID",
- ],
- "markdown_name": "Dataset Types",
- },
- {
- "source": [{"outbound": rcb_name}, "RID"],
- "markdown_name": "Created By",
- },
- {
- "source": [{"outbound": rmb_name}, "RID"],
- "markdown_name": "Modified By",
+ },
  },
- ]
- },
- }
-
- def _dataset_visible_fkeys(self) -> dict[str, Any]:
- def fkey_name(fk):
- return [fk.name[0].name, fk.name[1]]
-
- dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
- source_list = [
- {
- "source": [
- {"inbound": fkey_name(fkey.self_fkey)},
- {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
- "RID",
  ],
- "markdown_name": other_fkey.pk_table.name,
  }
- for fkey in dataset_table.find_associations(max_arity=3, pure=False)
- ]
- return {"detailed": source_list}
-
- def _generate_dataset_annotations(self) -> dict[str, Any]:
+ if self._use_minid
+ else {}
+ )
  return {
  deriva_tags.export_fragment_definitions: {
- "dataset_export_outputs": self._export_outputs()
+ "dataset_export_outputs": self._export_annotation()
  },
- deriva_tags.visible_columns: self.dataset_visible_columns(),
  deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
  deriva_tags.export_2019: {
  "detailed": {
@@ -1496,45 +1469,56 @@ class Dataset:
  "outputs": [{"fragment_key": "dataset_export_outputs"}],
  "displayname": "BDBag Download",
  "bag_idempotent": True,
- "postprocessors": [
- {
- "processor": "identifier",
- "processor_params": {
- "test": False,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- }
- ],
- },
- {
- "type": "BAG",
- "outputs": [{"fragment_key": "dataset_export_outputs"}],
- "displayname": "BDBag to Cloud",
- "bag_idempotent": True,
- "postprocessors": [
- {
- "processor": "cloud_upload",
- "processor_params": {
- "acl": "public-read",
- "target_url": "s3://eye-ai-shared/",
- },
- },
- {
- "processor": "identifier",
- "processor_params": {
- "test": False,
- "env_column_map": {
- "Dataset_RID": "{RID}@{snaptime}",
- "Description": "{Description}",
- },
- },
- },
- ],
- },
+ }
+ | post_processors
  ]
  }
  },
  }
+
+ def _dataset_visible_fkeys(self) -> dict[str, Any]:
+ def fkey_name(fk):
+ return [fk.name[0].name, fk.name[1]]
+
+ dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
+
+ source_list = [
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Previous Versions",
+ "entity": True,
+ },
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Parent Datasets",
+ },
+ {
+ "source": [
+ {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+ {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+ "RID",
+ ],
+ "markdown_name": "Child Datasets",
+ },
+ ]
+ source_list.extend(
+ [
+ {
+ "source": [
+ {"inbound": fkey_name(fkey.self_fkey)},
+ {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
+ "RID",
+ ],
+ "markdown_name": other_fkey.pk_table.name,
+ }
+ for fkey in dataset_table.find_associations(max_arity=3, pure=False)
+ ]
+ )
+ return {"detailed": source_list}