deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +5 -11
- deriva_ml/dataset.py +293 -307
- deriva_ml/dataset_aux_classes.py +10 -10
- deriva_ml/demo_catalog.py +90 -67
- deriva_ml/deriva_definitions.py +43 -4
- deriva_ml/deriva_ml_base.py +31 -30
- deriva_ml/deriva_model.py +17 -5
- deriva_ml/execution.py +102 -89
- deriva_ml/execution_configuration.py +2 -1
- deriva_ml/history.py +2 -0
- deriva_ml/schema_setup/annotations.py +341 -126
- deriva_ml/schema_setup/create_schema.py +33 -65
- deriva_ml/schema_setup/policy.json +7 -3
- deriva_ml/upload.py +3 -3
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/METADATA +2 -2
- deriva_ml-1.13.3.dist-info/RECORD +31 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/WHEEL +1 -1
- deriva_ml-1.13.1.dist-info/RECORD +0 -31
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
@@ -1,18 +1,30 @@
 """
-This module defines the DataSet class with is used to manipulate datasets in DerivaML
-The intended use of this class is as a base class in DerivaML so all the methods documented here are
+This module defines the DataSet class with is used to manipulate datasets in DerivaML.
+The intended use of this class is as a base class in DerivaML, so all the methods documented here are
 accessible via a DerivaML class instance.

-
 """

 from __future__ import annotations
-from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
+from bdbag.fetch.fetcher import fetch_single_file
 from collections import defaultdict
+from graphlib import TopologicalSorter
+import json
+import logging
+from pathlib import Path
+from pydantic import (
+    validate_call,
+    ConfigDict,
+)
+import requests
+from tempfile import TemporaryDirectory
+from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
+

 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
+import deriva.core.utils.hash_utils as hash_utils
 from deriva.transfer.download.deriva_export import DerivaExport
 from deriva.transfer.download.deriva_download import (
     DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
     DerivaDownloadTimeoutError,
 )

+
 try:
     from icecream import ic
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

-from graphlib import TopologicalSorter
-import json
-import logging
-from pathlib import Path
-from pydantic import (
-    validate_call,
-    ConfigDict,
-)
-import requests
-
-from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
-
 from deriva_ml import DatasetBag
 from .deriva_definitions import (
     ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
     RID,
     DRY_RUN_RID,
 )
-from .history import iso_to_snap
 from .deriva_model import DerivaModel
 from .database_model import DatabaseModel
 from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:

     _Logger = logging.getLogger("deriva_ml")

-    def __init__(
+    def __init__(
+        self,
+        model: DerivaModel,
+        cache_dir: Path,
+        working_dir: Path,
+        use_minid: bool = True,
+    ):
         self._model = model
         self._ml_schema = ML_SCHEMA
         self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
         self._cache_dir = cache_dir
         self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
+        self._use_minid = use_minid

     def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
         try:
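
For orientation, a minimal sketch of how the reworked constructor might be called. The DerivaModel wiring and the directory paths here are assumptions for illustration, not part of this diff:

    from pathlib import Path

    # Hypothetical setup: `model` is an existing DerivaModel bound to a catalog.
    ds = Dataset(
        model=model,
        cache_dir=Path("/tmp/deriva-cache"),
        working_dir=Path("/tmp/deriva-work"),
        use_minid=False,   # new in 1.13.3: skip MINID creation and resolution
    )
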
@@ -100,27 +106,28 @@ class Dataset:
         dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
-    ) ->
+    ) -> None:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-
+        # determine snapshot after changes were made
+        snap = self._model.catalog.get("/").json()["snaptime"]
         # Construct version records for insert
-        version_records = [
-
-
-
-
-
-
-
-
+        version_records = schema_path.tables["Dataset_Version"].insert(
+            [
+                {
+                    "Dataset": dataset.rid,
+                    "Version": str(dataset.version),
+                    "Description": description,
+                    "Execution": execution_rid,
+                    "Snapshot": snap,
+                }
+                for dataset in dataset_list
+            ]
+        )

-        #
-
-            {"Version": v["RID"], "RID": v["Dataset"]}
-
-        ]
-        schema_path.tables["Dataset"].update(version_rids)
-        return version_rids
+        # And update the dataset records.
+        schema_path.tables["Dataset"].update(
+            [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
+        )

     def _bootstrap_versions(self):
         datasets = [ds["RID"] for ds in self.find_datasets()]
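
A sketch of the records this rewritten version-bumping code writes. The RIDs and snapshot value below are made up for illustration:

    snap = "2VE2-5GKA-WKYG"  # snaptime reported by catalog.get("/")
    inserted = [
        {"RID": "1-XY42", "Dataset": "1-AB30", "Version": "1.2.0",
         "Description": "monthly refresh", "Execution": None, "Snapshot": snap},
    ]
    # Each Dataset row is then pointed back at its new Dataset_Version record:
    updates = [{"Version": v["RID"], "RID": v["Dataset"]} for v in inserted]
    assert updates == [{"Version": "1-XY42", "RID": "1-AB30"}]
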
@@ -170,6 +177,9 @@ class Dataset:
         Returns:
             A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
         """
+
+        if not self._is_dataset_rid(dataset_rid):
+            raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
         version_path = (
             self._model.catalog.getPathBuilder()
             .schemas[self._ml_schema]
@@ -179,7 +189,7 @@ class Dataset:
             DatasetHistory(
                 dataset_version=DatasetVersion.parse(v["Version"]),
                 minid=v["Minid"],
-
+                snapshot=v["Snapshot"],
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
@@ -240,7 +250,7 @@ class Dataset:

         Args:
             dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            component: Which version of the dataset_table to increment. Major, Minor, or Patch
             description: Description of the version update of the dataset_table.
             execution_rid: Which execution is performing increment.

@@ -248,7 +258,7 @@ class Dataset:
             new semantic version of the dataset_table as a 3-tuple

         Raises:
-            DerivaMLException: if provided RID is not to a dataset_table.
+            DerivaMLException: if provided, RID is not to a dataset_table.
         """

         # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +278,7 @@ class Dataset:
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
         self,
-
+        dataset_types: str | list[str],
         description: str,
         execution_rid: Optional[RID] = None,
         version: Optional[DatasetVersion] = None,
@@ -276,7 +286,7 @@ class Dataset:
         """Create a new dataset_table from the specified list of RIDs.

         Args:
-
+            dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
@@ -304,7 +314,7 @@ class Dataset:
             return False

         # Create the entry for the new dataset_table and get its RID.
-        ds_types = [
+        ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
         pb = self._model.catalog.getPathBuilder()
         for ds_type in ds_types:
             if not check_dataset_type(ds_type):
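
Hypothetical call patterns for the reworked create_dataset signature ("Training" and "Image" are assumed DatasetType vocabulary terms, and `ml` an assumed DerivaML instance):

    rid_single = ml.create_dataset(dataset_types="Training", description="Training split")
    rid_multi = ml.create_dataset(
        dataset_types=["Training", "Image"], description="Training split with images"
    )

    # The normalization used in the diff, shown standalone:
    dataset_types = "Training"
    ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
    assert ds_types == ["Training"]
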
@@ -452,7 +462,9 @@ class Dataset:
         )

         # self.model = self.catalog.getCatalogModel()
-        self.dataset_table.annotations.update(
+        self.dataset_table.annotations.update(
+            self._generate_dataset_download_annotations()
+        )
         self._model.model.apply()
         return table

@@ -464,7 +476,7 @@ class Dataset:

         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            recurse:
+            recurse: (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.

         Returns:
@@ -530,8 +542,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +556,7 @@ class Dataset:

         Args:
             member_rid:
-            path:
+            path: (Default value = None)

         Returns:

@@ -570,7 +582,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +630,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this operation.
         """
@@ -634,7 +646,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +682,7 @@ class Dataset:
         )

     @validate_call
-    def list_dataset_parents(self, dataset_rid: RID) -> list[
+    def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
         """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
         nested dataset.

@@ -696,14 +708,14 @@ class Dataset:

     @validate_call
     def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
-        """Given a dataset_table RID, return a list of RIDs
+        """Given a dataset_table RID, return a list of RIDs for any nested datasets.

         Args:
             dataset_rid: A dataset_table RID.
-            recurse: If True, return a list of
+            recurse: If True, return a list of nested datasets RIDs.

         Returns:
-            list of
+            list of nested dataset RIDs.

         """
         dataset_dataset_path = (
@@ -726,7 +738,7 @@ class Dataset:

         return find_children(dataset_rid)

-    def
+    def _export_vocabulary(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
         """
@@ -756,10 +768,10 @@ class Dataset:
     ) -> Iterator[tuple[str, str, Table]]:
         paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

-        def source_path(path: tuple[Table, ...]):
+        def source_path(path: tuple[Table, ...]) -> list[str]:
             """Convert a tuple representing a path into a source path component with FK linkage"""
             path = list(path)
-            p = [f"{self._model.ml_schema}:Dataset/RID={{
+            p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
             for table in path[1:]:
                 if table.name == "Dataset_Dataset":
                     p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +815,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,  # limit=1
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
@@ -857,7 +869,7 @@ class Dataset:
         """

         def children_depth(
-            dataset_rid: RID, nested_datasets: dict[
+            dataset_rid: RID, nested_datasets: dict[str, list[str]]
         ) -> int:
             """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
@@ -899,13 +911,13 @@ class Dataset:
     def _dataset_specification(
         self,
         writer: Callable[[str, str, Table], list[dict[str, Any]]],
-        dataset: DatasetSpec,
+        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
-        The top level data directory of the resulting BDBag will have one subdirectory for element type.
+        The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
         will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
-
+        subdirectory for each object that is reachable from the dataset_table members.

         To simplify reconstructing the relationship between tables, the CVS for each
         The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +925,7 @@ class Dataset:

         For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
         objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
-        which has two
+        which has two assets in it. The layout of the resulting bdbag would be:
         data
             CV1/
                 cv1.csv
@@ -939,12 +951,12 @@ class Dataset:
         Returns:
             A dataset_table specification.
         """
-        element_spec =
+        element_spec = self._export_vocabulary(writer)
         for path in self._table_paths(
             dataset=dataset, snapshot_catalog=snapshot_catalog
         ):
             element_spec.extend(writer(*path))
-        return
+        return element_spec

     def _download_dataset_bag(
         self,
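
The writer argument threaded through _export_vocabulary and _dataset_specification is just a callable of shape Callable[[str, str, Table], list[dict[str, Any]]]. A stand-in writer, useful only for inspecting the traversal and not part of the package, might look like this:

    from typing import Any
    from deriva.core.ermrest_model import Table

    def debug_writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
        # Record the source path, destination path, and table name for each element.
        return [{"source": spath, "destination": dpath, "table": table.name}]
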
@@ -964,7 +976,8 @@ class Dataset:
             for the dataset.
         """
         if (
-            execution_rid
+            execution_rid
+            and execution_rid != DRY_RUN_RID
             and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
         ):
             raise DerivaMLException(f"RID {execution_rid} is not an execution")
@@ -984,7 +997,7 @@ class Dataset:
             for h in self.dataset_history(dataset_rid=dataset.rid)
             if h.dataset_version == dataset.version
         ][0]
-        return f"{self._model.catalog.catalog_id}@{
+        return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

     def _create_dataset_minid(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -999,7 +1012,7 @@ class Dataset:
         )
         try:
             self._logger.info(
-                f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
+                f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
             )
             # Generate the bag and put into S3 storage.
             exporter = DerivaExport(
@@ -1008,9 +1021,10 @@ class Dataset:
                 output_dir=tmp_dir,
                 defer_download=True,
                 timeout=(10, 610),
-                envars={"
+                envars={"RID": dataset.rid},
             )
             minid_page_url = exporter.export()[0]  # Get the MINID launch page
+
         except (
             DerivaDownloadError,
             DerivaDownloadConfigurationError,
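
The versioned catalog reference returned by _version_snapshot is the catalog id joined to the snapshot stored on the matching Dataset_Version row; the values below are illustrative:

    catalog_id = "45"              # assumed catalog id
    snapshot = "2VE2-5GKA-WKYG"    # Snapshot column of the matching Dataset_Version row
    versioned_catalog = f"{catalog_id}@{snapshot}"   # -> "45@2VE2-5GKA-WKYG"
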
@@ -1020,17 +1034,18 @@ class Dataset:
         ) as e:
             raise DerivaMLException(format_exception(e))
         # Update version table with MINID.
-
-
-
-
-
-
-
-
-
-
-
+        if self._use_minid:
+            version_path = (
+                self._model.catalog.getPathBuilder()
+                .schemas[self._ml_schema]
+                .tables["Dataset_Version"]
+            )
+            version_rid = [
+                h
+                for h in self.dataset_history(dataset_rid=dataset.rid)
+                if h.dataset_version == dataset.version
+            ][0].version_rid
+            version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
         return minid_page_url

     def _get_dataset_minid(
@@ -1073,14 +1088,25 @@ class Dataset:
                 raise DerivaMLException(
                     f"Minid for dataset {dataset.rid} doesn't exist"
                 )
-            self.
+            if self._use_minid:
+                self._logger.info("Creating new MINID for dataset %s", dataset.rid)
             minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
         # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
-
-
+        if self._use_minid:
+            r = requests.get(minid_url, headers={"accept": "application/json"})
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version, **r.json()
+            )
+        else:
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version,
+                RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
+                location=minid_url,
+            )
+        return dataset_minid

     def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
-        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
+        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
         that all the metadata is correct

         Args:
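
When use_minid is enabled, the MINID landing page is resolved as JSON and the payload is handed to the DatasetMinid model. The request pattern matches the code above; the landing-page URL here is an assumption for illustration:

    import requests

    minid_url = "https://identifiers.fair-research.org/hdl:20.500.12582/example"  # assumed URL
    r = requests.get(minid_url, headers={"accept": "application/json"})
    metadata = r.json()   # checksum, locations, etc., consumed by DatasetMinid
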
@@ -1089,19 +1115,37 @@ class Dataset:
             the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
         """

-        # Check to see if we have an existing idempotent materialization of the desired bag. If so, then
+        # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
         # it. If not, then we need to extract the contents of the archive into our cache directory.
         bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
         if bag_dir.exists():
-
-
-
-
-
-
-
-
-
+            self._logger.info(
+                f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
+            return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+
+        # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
+        with TemporaryDirectory() as tmp_dir:
+            if self._use_minid:
+                # Get bag from S3
+                archive_path = fetch_single_file(minid.bag_url)
+            else:
+                exporter = DerivaExport(
+                    host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
+                )
+                archive_path = exporter.retrieve_file(minid.bag_url)
+            hashes = hash_utils.compute_file_hashes(
+                archive_path, hashes=["md5", "sha256"]
+            )
+            checksum = hashes["sha256"][0]
+            bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
+            if bag_dir.exists():
+                self._logger.info(
+                    f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+                )
+                return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+            bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
+            bdb.validate_bag_structure(bag_path)
         return Path(bag_path)

     def _materialize_dataset_bag(
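
The bag cache is keyed by dataset RID plus the archive's sha256 digest, so re-downloading the same version resolves to the same directory. A condensed sketch of that keying, assuming the archive already exists locally (it will not run against a missing file):

    from pathlib import Path
    import deriva.core.utils.hash_utils as hash_utils

    cache_dir = Path("/tmp/deriva-cache")        # assumed cache root
    archive_path = "/tmp/Dataset_1-AB30.zip"     # assumed downloaded archive
    hashes = hash_utils.compute_file_hashes(archive_path, hashes=["md5", "sha256"])
    checksum = hashes["sha256"][0]
    bag_dir = cache_dir / f"1-AB30_{checksum}"   # reused on later downloads of this version
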
@@ -1120,17 +1164,18 @@ class Dataset:

         def update_status(status: Status, msg: str) -> None:
             """Update the current status for this execution in the catalog"""
-
-            self.
-
-
-
-
-
-
-
-
-
+            if execution_rid and execution_rid != DRY_RUN_RID:
+                self._model.catalog.getPathBuilder().schemas[
+                    self._ml_schema
+                ].Execution.update(
+                    [
+                        {
+                            "RID": execution_rid,
+                            "Status": status.value,
+                            "Status_Detail": msg,
+                        }
+                    ]
+                )
             self._logger.info(msg)

         def fetch_progress_callback(current, total):
@@ -1152,6 +1197,9 @@ class Dataset:

         # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
         if not validated_check.exists():
+            self._logger.info(
+                f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
             bdb.materialize(
                 bag_path.as_posix(),
                 fetch_callback=fetch_progress_callback,
@@ -1160,9 +1208,8 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)

-    def
+    def _export_annotation(
         self,
-        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
@@ -1171,19 +1218,6 @@ class Dataset:
             An export specification suitable for Chaise.
         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath: list[Table]:
-                dpath: list[Table]:
-                table: Table
-
-            Returns:
-                An export specification suitable for Chaise.
-            """
-            return self._export_dataset_element(spath, dpath, table)
-
         # Export specification is a specification for the datasets, plus any controlled vocabulary
         return [
             {
@@ -1202,41 +1236,34 @@ class Dataset:
                 "destination": {"type": "json", "name": "schema"},
             },
         ] + self._dataset_specification(
-
+            self._export_annotation_dataset_element,
+            None,
+            snapshot_catalog=snapshot_catalog,
         )

-    def
+    def _export_specification(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> list[dict[str, Any]]:
         """
+        Generate a specification for export engine for specific dataset.
+
         Returns:
             a download specification for the datasets in the provided model.

         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath:
-                dpath:
-                table: Table
-
-            Returns:
-
-            """
-            return self._download_dataset_element(spath, dpath, table)
-
         # Download spec is the spec for any controlled vocabulary and for the dataset_table.
         return [
             {
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(
+        ] + self._dataset_specification(
+            self._export_specification_dataset_element, dataset, snapshot_catalog
+        )

     @staticmethod
-    def
+    def _export_specification_dataset_element(
         spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Return the download specification for the data object indicated by a path through the data model.
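
The two dataset-element writers introduced here produce differently shaped entries for the same (spath, dpath, table) triple: one feeds the Chaise export annotation, the other the deriva-download processor chain. The values below are illustrative only:

    chaise_export_entry = {       # shape produced by _export_annotation_dataset_element
        "source": {"api": "entity", "path": "deriva-ml:Dataset_Type", "skip_root_path": True},
        "destination": {"name": "Dataset_Type", "type": "csv"},
    }
    download_processor_entry = {  # shape produced by _export_specification_dataset_element
        "processor": "csv",
        "processor_params": {"query_path": "/entity/deriva-ml:Dataset_Type", "output_path": "Dataset_Type"},
    }
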
@@ -1253,7 +1280,7 @@ class Dataset:
             {
                 "processor": "csv",
                 "processor_params": {
-                    "query_path": f"/entity/{spath}
+                    "query_path": f"/entity/{spath}",
                     "output_path": dpath,
                 },
             }
@@ -1266,16 +1293,15 @@ class Dataset:
                 {
                     "processor": "fetch",
                     "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                         "output_path": f"asset/{table.name}",
                     },
                 }
             )
         return exports

-
-
-        spath: str, dpath: str, table: Table
+    def _export_annotation_dataset_element(
+        self, spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Given a path in the data model, output an export specification for the path taken to get to the current table.

@@ -1291,9 +1317,23 @@ class Dataset:
         # into a path in the form of /S:T1/S:T2/S:Table
         # Generate the destination path in the file system using just the table names.

+        skip_root_path = False
+        if spath.startswith(f"{self._ml_schema}:Dataset/"):
+            # Chaise will add table name and RID filter, so strip it off.
+            spath = "/".join(spath.split("/")[2:])
+            if spath == "":
+                # This path is to just the dataset table.
+                return []
+            else:
+                # A vocabulary table, so we don't want the root_path.
+                skip_root_path = True
         exports = [
             {
-                "source": {
+                "source": {
+                    "api": "entity",
+                    "path": spath,
+                    "skip_root_path": skip_root_path,
+                },
                 "destination": {"name": dpath, "type": "csv"},
             }
         ]
@@ -1304,6 +1344,7 @@ class Dataset:
             exports.append(
                 {
                     "source": {
+                        "skip_root_path": False,
                         "api": "attribute",
                         "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                     },
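
A worked example of the new path trimming above, with illustrative values: Chaise supplies the Dataset table and RID filter itself, so the writer drops the first two path components and flags the remainder to skip the root path.

    ml_schema = "deriva-ml"
    spath = f"{ml_schema}:Dataset/RID={{RID}}/{ml_schema}:Dataset_Type"
    trimmed = "/".join(spath.split("/")[2:])
    assert trimmed == "deriva-ml:Dataset_Type"   # non-empty remainder -> skip_root_path=True
    # A path of just "deriva-ml:Dataset/RID={RID}" trims to "" and the writer returns [].
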
@@ -1313,44 +1354,53 @@ class Dataset:
         return exports

     def _generate_dataset_download_spec(
-        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> dict[str, Any]:
         """
+        Generate a specification for downloading a specific dataset.
+
+        This routine creates a download specification that can be used by the Deriva export processor to download
+        a specific dataset as a MINID.

         Returns:
         """
         s3_target = "s3://eye-ai-shared"
         minid_test = False

         catalog_id = self._version_snapshot(dataset)
-
-
+        post_processors = (
+            {
+                "post_processors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": s3_target,
+                        },
+                    },
+                    {
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": minid_test,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
+                            },
+                        },
+                    },
+                ]
+            }
+            if self._use_minid
+            else {}
+        )
+        return post_processors | {
+            "env": {"RID": "{RID}"},
             "bag": {
-                "bag_name": "Dataset_{
+                "bag_name": "Dataset_{RID}",
                 "bag_algorithms": ["md5"],
                 "bag_archiver": "zip",
                 "bag_metadata": {},
                 "bag_idempotent": True,
             },
-            "post_processors": [
-                {
-                    "processor": "cloud_upload",
-                    "processor_params": {
-                        "acl": "public-read",
-                        "target_url": s3_target,
-                    },
-                },
-                {
-                    "processor": "identifier",
-                    "processor_params": {
-                        "test": minid_test,
-                        "env_column_map": {
-                            "Dataset_RID": "{RID}@{snaptime}",
-                            "Description": "{Description}",
-                        },
-                    },
-                },
-            ],
             "catalog": {
                 "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
                 "catalog_id": catalog_id,
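
The conditional post-processor block merges into the download specification through the dict-union operator; a trimmed sketch of the resulting shape when use_minid is disabled:

    use_minid = False
    post_processors = (
        {"post_processors": [{"processor": "cloud_upload"}, {"processor": "identifier"}]}
        if use_minid
        else {}
    )
    spec = post_processors | {
        "env": {"RID": "{RID}"},
        "bag": {"bag_name": "Dataset_{RID}", "bag_idempotent": True},
    }
    assert "post_processors" not in spec   # no S3 upload or MINID minting steps
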
@@ -1366,125 +1416,50 @@ class Dataset:
                     {
                         "processor": "env",
                         "processor_params": {
-                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={
+                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
                             "output_path": "Dataset",
                             "query_keys": ["RID", "Description"],
                         },
                     },
                 ]
-                + self.
+                + self._export_specification(dataset, snapshot_catalog),
             },
         }

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-            "*": [
-                "RID",
-                "Description",
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
-                    },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "detailed": [
-                "RID",
-                "Description",
-                {
-                    "source": [
-                        {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
-                        {
-                            "outbound": [
-                                "deriva-ml",
-                                "Dataset_Dataset_Type_Dataset_Type_fkey",
-                            ]
+    def _generate_dataset_download_annotations(self) -> dict[str, Any]:
+        post_processors = (
+            {
+                "type": "BAG",
+                "outputs": [{"fragment_key": "dataset_export_outputs"}],
+                "displayname": "BDBag to Cloud",
+                "bag_idempotent": True,
+                "postprocessors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": "s3://eye-ai-shared/",
                         },
-                        "RID",
-                    ],
-                    "markdown_name": "Dataset Types",
-                },
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
                     },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "filter": {
-                "and": [
-                    {"source": "RID"},
-                    {"source": "Description"},
                     {
-                        "
-
-
-
-
-
-                    },
-                    {
-                        "outbound": [
-                            "deriva-ml",
-                            "Dataset_Dataset_Type_Dataset_Type_fkey",
-                        ]
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": False,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
                             },
-
-                    ],
-                    "markdown_name": "Dataset Types",
-                },
-                {
-                    "source": [{"outbound": rcb_name}, "RID"],
-                    "markdown_name": "Created By",
-                },
-                {
-                    "source": [{"outbound": rmb_name}, "RID"],
-                    "markdown_name": "Modified By",
+                        },
                 },
-            ]
-        },
-    }
-
-    def _dataset_visible_fkeys(self) -> dict[str, Any]:
-        def fkey_name(fk):
-            return [fk.name[0].name, fk.name[1]]
-
-        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
-        source_list = [
-            {
-                "source": [
-                    {"inbound": fkey_name(fkey.self_fkey)},
-                    {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
-                    "RID",
                 ],
-                "markdown_name": other_fkey.pk_table.name,
             }
-
-
-
-
-    def _generate_dataset_annotations(self) -> dict[str, Any]:
+            if self._use_minid
+            else {}
+        )
         return {
             deriva_tags.export_fragment_definitions: {
-                "dataset_export_outputs": self.
+                "dataset_export_outputs": self._export_annotation()
            },
-            deriva_tags.visible_columns: self.dataset_visible_columns(),
             deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
             deriva_tags.export_2019: {
                 "detailed": {
@@ -1494,45 +1469,56 @@ class Dataset:
                         "outputs": [{"fragment_key": "dataset_export_outputs"}],
                         "displayname": "BDBag Download",
                         "bag_idempotent": True,
-
-
-                        "processor": "identifier",
-                        "processor_params": {
-                            "test": False,
-                            "env_column_map": {
-                                "Dataset_RID": "{RID}@{snaptime}",
-                                "Description": "{Description}",
-                            },
-                        },
-                    }
-                ],
-            },
-            {
-                "type": "BAG",
-                "outputs": [{"fragment_key": "dataset_export_outputs"}],
-                "displayname": "BDBag to Cloud",
-                "bag_idempotent": True,
-                "postprocessors": [
-                    {
-                        "processor": "cloud_upload",
-                        "processor_params": {
-                            "acl": "public-read",
-                            "target_url": "s3://eye-ai-shared/",
-                        },
-                    },
-                    {
-                        "processor": "identifier",
-                        "processor_params": {
-                            "test": False,
-                            "env_column_map": {
-                                "Dataset_RID": "{RID}@{snaptime}",
-                                "Description": "{Description}",
-                            },
-                        },
-                    },
-                ],
-            },
+                    }
+                    | post_processors
                 ]
             }
         },
     }
+
+    def _dataset_visible_fkeys(self) -> dict[str, Any]:
+        def fkey_name(fk):
+            return [fk.name[0].name, fk.name[1]]
+
+        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
+
+        source_list = [
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Previous Versions",
+                "entity": True,
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Parent Datasets",
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Child Datasets",
+            },
+        ]
+        source_list.extend(
+            [
+                {
+                    "source": [
+                        {"inbound": fkey_name(fkey.self_fkey)},
+                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
+                        "RID",
+                    ],
+                    "markdown_name": other_fkey.pk_table.name,
+                }
+                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
+            ]
+        )
+        return {"detailed": source_list}