deriva-ml 1.13.2__py3-none-any.whl → 1.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +5 -11
- deriva_ml/dataset.py +294 -295
- deriva_ml/dataset_aux_classes.py +10 -10
- deriva_ml/demo_catalog.py +90 -67
- deriva_ml/deriva_definitions.py +62 -4
- deriva_ml/deriva_ml_base.py +24 -29
- deriva_ml/deriva_model.py +17 -5
- deriva_ml/execution.py +23 -3
- deriva_ml/history.py +4 -1
- deriva_ml/schema_setup/annotations.py +341 -126
- deriva_ml/schema_setup/create_schema.py +33 -65
- deriva_ml/schema_setup/policy.json +7 -3
- deriva_ml/upload.py +3 -3
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.14.0.dist-info}/METADATA +2 -2
- deriva_ml-1.14.0.dist-info/RECORD +31 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.14.0.dist-info}/WHEEL +1 -1
- deriva_ml-1.13.2.dist-info/RECORD +0 -31
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.14.0.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.14.0.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.14.0.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
@@ -1,18 +1,30 @@
 """
-This module defines the DataSet class with is used to manipulate datasets in DerivaML
-The intended use of this class is as a base class in DerivaML so all the methods documented here are
+This module defines the DataSet class with is used to manipulate datasets in DerivaML.
+The intended use of this class is as a base class in DerivaML, so all the methods documented here are
 accessible via a DerivaML class instance.

-
 """

 from __future__ import annotations
-from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
+from bdbag.fetch.fetcher import fetch_single_file
 from collections import defaultdict
+from graphlib import TopologicalSorter
+import json
+import logging
+from pathlib import Path
+from pydantic import (
+    validate_call,
+    ConfigDict,
+)
+import requests
+from tempfile import TemporaryDirectory
+from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING

+from .history import iso_to_snap
 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
+import deriva.core.utils.hash_utils as hash_utils
 from deriva.transfer.download.deriva_export import DerivaExport
 from deriva.transfer.download.deriva_download import (
     DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
     DerivaDownloadTimeoutError,
 )

+
 try:
     from icecream import ic
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

-from graphlib import TopologicalSorter
-import json
-import logging
-from pathlib import Path
-from pydantic import (
-    validate_call,
-    ConfigDict,
-)
-import requests
-
-from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
-
 from deriva_ml import DatasetBag
 from .deriva_definitions import (
     ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
     RID,
     DRY_RUN_RID,
 )
-from .history import iso_to_snap
 from .deriva_model import DerivaModel
 from .database_model import DatabaseModel
 from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:

     _Logger = logging.getLogger("deriva_ml")

-    def __init__(
+    def __init__(
+        self,
+        model: DerivaModel,
+        cache_dir: Path,
+        working_dir: Path,
+        use_minid: bool = True,
+    ):
         self._model = model
         self._ml_schema = ML_SCHEMA
         self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
         self._cache_dir = cache_dir
         self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
+        self._use_minid = use_minid

     def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
         try:
@@ -100,27 +106,28 @@ class Dataset:
         dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
-    ) ->
+    ) -> None:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-
+        # determine snapshot after changes were made
+        snap = self._model.catalog.get("/").json()["snaptime"]
         # Construct version records for insert
-        version_records = [
-
-
-
-
-
-
-
-
+        version_records = schema_path.tables["Dataset_Version"].insert(
+            [
+                {
+                    "Dataset": dataset.rid,
+                    "Version": str(dataset.version),
+                    "Description": description,
+                    "Execution": execution_rid,
+                    "Snapshot": snap,
+                }
+                for dataset in dataset_list
+            ]
+        )

-        #
-
-            {"Version": v["RID"], "RID": v["Dataset"]}
-
-        ]
-        schema_path.tables["Dataset"].update(version_rids)
-        return version_rids
+        # And update the dataset records.
+        schema_path.tables["Dataset"].update(
+            [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
+        )

     def _bootstrap_versions(self):
         datasets = [ds["RID"] for ds in self.find_datasets()]
@@ -161,6 +168,21 @@ class Dataset:
             ]
         )

+    def _set_version_snapshot(self):
+        dataset_version_path = (
+            self._model.catalog.getPathBuilder()
+            .schemas[self._ml_schema]
+            .tables["Dataset_Version"]
+        )
+        versions = dataset_version_path.entities().fetch()
+        dataset_version_path.update(
+            [
+                {"RID": h["RID"], "Snapshot": iso_to_snap(h["RCT"])}
+                for h in versions
+                if not h["Snapshot"]
+            ]
+        )
+
     def dataset_history(self, dataset_rid: RID) -> list[DatasetHistory]:
         """Return a list of DatasetHistory objects representing the dataset

@@ -170,6 +192,9 @@ class Dataset:
         Returns:
             A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
         """
+
+        if not self._is_dataset_rid(dataset_rid):
+            raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
         version_path = (
             self._model.catalog.getPathBuilder()
             .schemas[self._ml_schema]
@@ -179,7 +204,7 @@ class Dataset:
             DatasetHistory(
                 dataset_version=DatasetVersion.parse(v["Version"]),
                 minid=v["Minid"],
-
+                snapshot=v["Snapshot"],
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
@@ -240,7 +265,7 @@ class Dataset:

         Args:
             dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            component: Which version of the dataset_table to increment. Major, Minor, or Patch
             description: Description of the version update of the dataset_table.
             execution_rid: Which execution is performing increment.

@@ -248,7 +273,7 @@ class Dataset:
             new semantic version of the dataset_table as a 3-tuple

         Raises:
-            DerivaMLException: if provided RID is not to a dataset_table.
+            DerivaMLException: if provided, RID is not to a dataset_table.
         """

         # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +293,7 @@ class Dataset:
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
         self,
-
+        dataset_types: str | list[str],
         description: str,
         execution_rid: Optional[RID] = None,
         version: Optional[DatasetVersion] = None,
@@ -276,7 +301,7 @@ class Dataset:
         """Create a new dataset_table from the specified list of RIDs.

         Args:
-
+            dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
@@ -304,7 +329,7 @@ class Dataset:
             return False

         # Create the entry for the new dataset_table and get its RID.
-        ds_types = [
+        ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
         pb = self._model.catalog.getPathBuilder()
         for ds_type in ds_types:
             if not check_dataset_type(ds_type):
@@ -452,7 +477,9 @@ class Dataset:
         )

         # self.model = self.catalog.getCatalogModel()
-        self.dataset_table.annotations.update(
+        self.dataset_table.annotations.update(
+            self._generate_dataset_download_annotations()
+        )
         self._model.model.apply()
         return table

@@ -464,7 +491,7 @@ class Dataset:

         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            recurse:
+            recurse: (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.

         Returns:
@@ -530,8 +557,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +571,7 @@ class Dataset:

         Args:
             member_rid:
-            path:
+            path: (Default value = None)

         Returns:

@@ -570,7 +597,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +645,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this operation.
         """
@@ -634,7 +661,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +697,7 @@ class Dataset:
         )

     @validate_call
-    def list_dataset_parents(self, dataset_rid: RID) -> list[
+    def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
         """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
         nested dataset.

@@ -696,14 +723,14 @@ class Dataset:

     @validate_call
     def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
-        """Given a dataset_table RID, return a list of RIDs
+        """Given a dataset_table RID, return a list of RIDs for any nested datasets.

         Args:
             dataset_rid: A dataset_table RID.
-            recurse: If True, return a list of
+            recurse: If True, return a list of nested datasets RIDs.

         Returns:
-            list of
+            list of nested dataset RIDs.

         """
         dataset_dataset_path = (
@@ -726,7 +753,7 @@ class Dataset:

         return find_children(dataset_rid)

-    def
+    def _export_vocabulary(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
         """
@@ -756,10 +783,10 @@ class Dataset:
     ) -> Iterator[tuple[str, str, Table]]:
         paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

-        def source_path(path: tuple[Table, ...]):
+        def source_path(path: tuple[Table, ...]) -> list[str]:
            """Convert a tuple representing a path into a source path component with FK linkage"""
            path = list(path)
-            p = [f"{self._model.ml_schema}:Dataset/RID={{
+            p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
            for table in path[1:]:
                if table.name == "Dataset_Dataset":
                    p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +830,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,  # limit=1
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
@@ -857,7 +884,7 @@ class Dataset:
         """

         def children_depth(
-            dataset_rid: RID, nested_datasets: dict[
+            dataset_rid: RID, nested_datasets: dict[str, list[str]]
         ) -> int:
             """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
@@ -899,13 +926,13 @@ class Dataset:
     def _dataset_specification(
         self,
         writer: Callable[[str, str, Table], list[dict[str, Any]]],
-        dataset: DatasetSpec,
+        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
-        The top level data directory of the resulting BDBag will have one subdirectory for element type.
+        The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
         will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
-
+        subdirectory for each object that is reachable from the dataset_table members.

         To simplify reconstructing the relationship between tables, the CVS for each
         The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +940,7 @@ class Dataset:

         For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
         objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
-        which has two
+        which has two assets in it. The layout of the resulting bdbag would be:
             data
                 CV1/
                     cv1.csv
@@ -939,12 +966,12 @@ class Dataset:
         Returns:
             A dataset_table specification.
         """
-        element_spec =
+        element_spec = self._export_vocabulary(writer)
         for path in self._table_paths(
             dataset=dataset, snapshot_catalog=snapshot_catalog
         ):
             element_spec.extend(writer(*path))
-        return
+        return element_spec

     def _download_dataset_bag(
         self,
@@ -985,7 +1012,7 @@ class Dataset:
             for h in self.dataset_history(dataset_rid=dataset.rid)
             if h.dataset_version == dataset.version
         ][0]
-        return f"{self._model.catalog.catalog_id}@{
+        return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

     def _create_dataset_minid(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -1000,7 +1027,7 @@ class Dataset:
         )
         try:
             self._logger.info(
-                f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
+                f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
             )
             # Generate the bag and put into S3 storage.
             exporter = DerivaExport(
@@ -1009,9 +1036,10 @@ class Dataset:
                 output_dir=tmp_dir,
                 defer_download=True,
                 timeout=(10, 610),
-                envars={"
+                envars={"RID": dataset.rid},
             )
             minid_page_url = exporter.export()[0]  # Get the MINID launch page
+
         except (
             DerivaDownloadError,
             DerivaDownloadConfigurationError,
@@ -1021,17 +1049,18 @@ class Dataset:
         ) as e:
             raise DerivaMLException(format_exception(e))
         # Update version table with MINID.
-
-
-
-
-
-
-
-
-
-
-
+        if self._use_minid:
+            version_path = (
+                self._model.catalog.getPathBuilder()
+                .schemas[self._ml_schema]
+                .tables["Dataset_Version"]
+            )
+            version_rid = [
+                h
+                for h in self.dataset_history(dataset_rid=dataset.rid)
+                if h.dataset_version == dataset.version
+            ][0].version_rid
+            version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
         return minid_page_url

     def _get_dataset_minid(
@@ -1074,14 +1103,25 @@ class Dataset:
                 raise DerivaMLException(
                     f"Minid for dataset {dataset.rid} doesn't exist"
                 )
-            self.
+            if self._use_minid:
+                self._logger.info("Creating new MINID for dataset %s", dataset.rid)
            minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
         # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
-
-
+        if self._use_minid:
+            r = requests.get(minid_url, headers={"accept": "application/json"})
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version, **r.json()
+            )
+        else:
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version,
+                RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
+                location=minid_url,
+            )
+        return dataset_minid

     def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
-        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
+        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
         that all the metadata is correct

         Args:
@@ -1090,19 +1130,37 @@ class Dataset:
             the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
         """

-        # Check to see if we have an existing idempotent materialization of the desired bag. If so, then
+        # Check to see if we have an existing idempotent materialization of the desired bag. If so, then reuse
         # it. If not, then we need to extract the contents of the archive into our cache directory.
         bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
         if bag_dir.exists():
-
-
-
-
-
-
-
-
-
+            self._logger.info(
+                f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
+            return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+
+        # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
+        with TemporaryDirectory() as tmp_dir:
+            if self._use_minid:
+                # Get bag from S3
+                archive_path = fetch_single_file(minid.bag_url)
+            else:
+                exporter = DerivaExport(
+                    host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
+                )
+                archive_path = exporter.retrieve_file(minid.bag_url)
+            hashes = hash_utils.compute_file_hashes(
+                archive_path, hashes=["md5", "sha256"]
+            )
+            checksum = hashes["sha256"][0]
+            bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
+            if bag_dir.exists():
+                self._logger.info(
+                    f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+                )
+                return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+            bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
+            bdb.validate_bag_structure(bag_path)
         return Path(bag_path)

     def _materialize_dataset_bag(
@@ -1154,6 +1212,9 @@ class Dataset:

         # If this bag has already been validated, our work is done. Otherwise, materialize the bag.
         if not validated_check.exists():
+            self._logger.info(
+                f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
             bdb.materialize(
                 bag_path.as_posix(),
                 fetch_callback=fetch_progress_callback,
@@ -1162,9 +1223,8 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)

-    def
+    def _export_annotation(
         self,
-        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
@@ -1173,19 +1233,6 @@ class Dataset:
             An export specification suitable for Chaise.
         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath: list[Table]:
-                dpath: list[Table]:
-                table: Table
-
-            Returns:
-                An export specification suitable for Chaise.
-            """
-            return self._export_dataset_element(spath, dpath, table)
-
         # Export specification is a specification for the datasets, plus any controlled vocabulary
         return [
             {
@@ -1204,41 +1251,34 @@ class Dataset:
                 "destination": {"type": "json", "name": "schema"},
             },
         ] + self._dataset_specification(
-
+            self._export_annotation_dataset_element,
+            None,
+            snapshot_catalog=snapshot_catalog,
         )

-    def
+    def _export_specification(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> list[dict[str, Any]]:
         """
+        Generate a specification for export engine for specific dataset.
+
         Returns:
             a download specification for the datasets in the provided model.

         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath:
-                dpath:
-                table: Table
-
-            Returns:
-
-            """
-            return self._download_dataset_element(spath, dpath, table)
-
         # Download spec is the spec for any controlled vocabulary and for the dataset_table.
         return [
             {
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(
+        ] + self._dataset_specification(
+            self._export_specification_dataset_element, dataset, snapshot_catalog
+        )

     @staticmethod
-    def
+    def _export_specification_dataset_element(
         spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Return the download specification for the data object indicated by a path through the data model.
@@ -1255,7 +1295,7 @@ class Dataset:
             {
                 "processor": "csv",
                 "processor_params": {
-                    "query_path": f"/entity/{spath}
+                    "query_path": f"/entity/{spath}",
                     "output_path": dpath,
                 },
             }
@@ -1268,16 +1308,15 @@ class Dataset:
                 {
                     "processor": "fetch",
                     "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                         "output_path": f"asset/{table.name}",
                     },
                 }
             )
         return exports

-
-
-        spath: str, dpath: str, table: Table
+    def _export_annotation_dataset_element(
+        self, spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Given a path in the data model, output an export specification for the path taken to get to the current table.

@@ -1293,9 +1332,23 @@ class Dataset:
         # into a path in the form of /S:T1/S:T2/S:Table
         # Generate the destination path in the file system using just the table names.

+        skip_root_path = False
+        if spath.startswith(f"{self._ml_schema}:Dataset/"):
+            # Chaise will add table name and RID filter, so strip it off.
+            spath = "/".join(spath.split("/")[2:])
+            if spath == "":
+                # This path is to just the dataset table.
+                return []
+            else:
+                # A vocabulary table, so we don't want the root_path.
+                skip_root_path = True
         exports = [
             {
-                "source": {
+                "source": {
+                    "api": "entity",
+                    "path": spath,
+                    "skip_root_path": skip_root_path,
+                },
                 "destination": {"name": dpath, "type": "csv"},
             }
         ]
@@ -1306,6 +1359,7 @@ class Dataset:
             exports.append(
                 {
                     "source": {
+                        "skip_root_path": False,
                         "api": "attribute",
                         "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                     },
@@ -1315,44 +1369,53 @@ class Dataset:
         return exports

     def _generate_dataset_download_spec(
-        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> dict[str, Any]:
         """
+        Generate a specification for downloading a specific dataset.

+        This routine creates a download specification that can be used by the Deriva export processor to download
+        a specific dataset as a MINID.
         Returns:
         """
         s3_target = "s3://eye-ai-shared"
         minid_test = False

         catalog_id = self._version_snapshot(dataset)
-
-
+        post_processors = (
+            {
+                "post_processors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": s3_target,
+                        },
+                    },
+                    {
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": minid_test,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
+                            },
+                        },
+                    },
+                ]
+            }
+            if self._use_minid
+            else {}
+        )
+        return post_processors | {
+            "env": {"RID": "{RID}"},
             "bag": {
-                "bag_name": "Dataset_{
+                "bag_name": "Dataset_{RID}",
                 "bag_algorithms": ["md5"],
                 "bag_archiver": "zip",
                 "bag_metadata": {},
                 "bag_idempotent": True,
             },
-            "post_processors": [
-                {
-                    "processor": "cloud_upload",
-                    "processor_params": {
-                        "acl": "public-read",
-                        "target_url": s3_target,
-                    },
-                },
-                {
-                    "processor": "identifier",
-                    "processor_params": {
-                        "test": minid_test,
-                        "env_column_map": {
-                            "Dataset_RID": "{RID}@{snaptime}",
-                            "Description": "{Description}",
-                        },
-                    },
-                },
-            ],
             "catalog": {
                 "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
                 "catalog_id": catalog_id,
@@ -1368,125 +1431,50 @@ class Dataset:
                    {
                        "processor": "env",
                        "processor_params": {
-                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={
+                            "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
                            "output_path": "Dataset",
                            "query_keys": ["RID", "Description"],
                        },
                    },
                ]
-                + self.
+                + self._export_specification(dataset, snapshot_catalog),
            },
        }

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-            "*": [
-                "RID",
-                "Description",
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
-                    },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "detailed": [
-                "RID",
-                "Description",
-                {
-                    "source": [
-                        {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
-                        {
-                            "outbound": [
-                                "deriva-ml",
-                                "Dataset_Dataset_Type_Dataset_Type_fkey",
-                            ]
+    def _generate_dataset_download_annotations(self) -> dict[str, Any]:
+        post_processors = (
+            {
+                "type": "BAG",
+                "outputs": [{"fragment_key": "dataset_export_outputs"}],
+                "displayname": "BDBag to Cloud",
+                "bag_idempotent": True,
+                "postprocessors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": "s3://eye-ai-shared/",
                        },
-                        "RID",
-                    ],
-                    "markdown_name": "Dataset Types",
-                },
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
                    },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "filter": {
-                "and": [
-                    {"source": "RID"},
-                    {"source": "Description"},
                    {
-                        "
-
-
-
-
-
-                        },
-                        {
-                            "outbound": [
-                                "deriva-ml",
-                                "Dataset_Dataset_Type_Dataset_Type_fkey",
-                            ]
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": False,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
                            },
-
-                        ],
-                        "markdown_name": "Dataset Types",
-                    },
-                    {
-                        "source": [{"outbound": rcb_name}, "RID"],
-                        "markdown_name": "Created By",
-                    },
-                    {
-                        "source": [{"outbound": rmb_name}, "RID"],
-                        "markdown_name": "Modified By",
+                        },
                    },
-                ]
-            },
-        }
-
-    def _dataset_visible_fkeys(self) -> dict[str, Any]:
-        def fkey_name(fk):
-            return [fk.name[0].name, fk.name[1]]
-
-        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
-        source_list = [
-            {
-                "source": [
-                    {"inbound": fkey_name(fkey.self_fkey)},
-                    {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
-                    "RID",
                ],
-                "markdown_name": other_fkey.pk_table.name,
            }
-
-
-
-
-    def _generate_dataset_annotations(self) -> dict[str, Any]:
+            if self._use_minid
+            else {}
+        )
        return {
            deriva_tags.export_fragment_definitions: {
-                "dataset_export_outputs": self.
+                "dataset_export_outputs": self._export_annotation()
            },
-            deriva_tags.visible_columns: self.dataset_visible_columns(),
            deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
            deriva_tags.export_2019: {
                "detailed": {
@@ -1496,45 +1484,56 @@ class Dataset:
                        "outputs": [{"fragment_key": "dataset_export_outputs"}],
                        "displayname": "BDBag Download",
                        "bag_idempotent": True,
-
-
-                            "processor": "identifier",
-                            "processor_params": {
-                                "test": False,
-                                "env_column_map": {
-                                    "Dataset_RID": "{RID}@{snaptime}",
-                                    "Description": "{Description}",
-                                },
-                            },
-                        }
-                    ],
-                },
-                {
-                    "type": "BAG",
-                    "outputs": [{"fragment_key": "dataset_export_outputs"}],
-                    "displayname": "BDBag to Cloud",
-                    "bag_idempotent": True,
-                    "postprocessors": [
-                        {
-                            "processor": "cloud_upload",
-                            "processor_params": {
-                                "acl": "public-read",
-                                "target_url": "s3://eye-ai-shared/",
-                            },
-                        },
-                        {
-                            "processor": "identifier",
-                            "processor_params": {
-                                "test": False,
-                                "env_column_map": {
-                                    "Dataset_RID": "{RID}@{snaptime}",
-                                    "Description": "{Description}",
-                                },
-                            },
-                        },
-                    ],
-                },
+                    }
+                    | post_processors
                ]
            }
        },
    }
+
+    def _dataset_visible_fkeys(self) -> dict[str, Any]:
+        def fkey_name(fk):
+            return [fk.name[0].name, fk.name[1]]
+
+        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
+
+        source_list = [
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Previous Versions",
+                "entity": True,
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Parent Datasets",
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Child Datasets",
+            },
+        ]
+        source_list.extend(
+            [
+                {
+                    "source": [
+                        {"inbound": fkey_name(fkey.self_fkey)},
+                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
+                        "RID",
+                    ],
+                    "markdown_name": other_fkey.pk_table.name,
+                }
+                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
+            ]
+        )
+        return {"detailed": source_list}