deriva-ml 1.13.2__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +5 -11
- deriva_ml/dataset.py +279 -295
- deriva_ml/dataset_aux_classes.py +10 -10
- deriva_ml/demo_catalog.py +90 -67
- deriva_ml/deriva_definitions.py +43 -4
- deriva_ml/deriva_ml_base.py +24 -29
- deriva_ml/deriva_model.py +17 -5
- deriva_ml/execution.py +23 -3
- deriva_ml/history.py +2 -0
- deriva_ml/schema_setup/annotations.py +341 -126
- deriva_ml/schema_setup/create_schema.py +33 -65
- deriva_ml/schema_setup/policy.json +7 -3
- deriva_ml/upload.py +3 -3
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/METADATA +2 -2
- deriva_ml-1.13.3.dist-info/RECORD +31 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/WHEEL +1 -1
- deriva_ml-1.13.2.dist-info/RECORD +0 -31
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.2.dist-info → deriva_ml-1.13.3.dist-info}/top_level.txt +0 -0
deriva_ml/dataset.py
CHANGED
@@ -1,18 +1,30 @@
 """
-This module defines the DataSet class with is used to manipulate datasets in DerivaML
-The intended use of this class is as a base class in DerivaML so all the methods documented here are
+This module defines the DataSet class with is used to manipulate datasets in DerivaML.
+The intended use of this class is as a base class in DerivaML, so all the methods documented here are
 accessible via a DerivaML class instance.

-
 """

 from __future__ import annotations
-from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
+from bdbag.fetch.fetcher import fetch_single_file
 from collections import defaultdict
+from graphlib import TopologicalSorter
+import json
+import logging
+from pathlib import Path
+from pydantic import (
+    validate_call,
+    ConfigDict,
+)
+import requests
+from tempfile import TemporaryDirectory
+from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
+

 from deriva.core.ermrest_model import Table
 from deriva.core.utils.core_utils import tag as deriva_tags, format_exception
+import deriva.core.utils.hash_utils as hash_utils
 from deriva.transfer.download.deriva_export import DerivaExport
 from deriva.transfer.download.deriva_download import (
     DerivaDownloadConfigurationError,
@@ -22,24 +34,12 @@ from deriva.transfer.download.deriva_download import (
     DerivaDownloadTimeoutError,
 )

+
 try:
     from icecream import ic
 except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa

-from graphlib import TopologicalSorter
-import json
-import logging
-from pathlib import Path
-from pydantic import (
-    validate_call,
-    ConfigDict,
-)
-import requests
-
-from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
-
 from deriva_ml import DatasetBag
 from .deriva_definitions import (
     ML_SCHEMA,
@@ -49,7 +49,6 @@ from .deriva_definitions import (
     RID,
     DRY_RUN_RID,
 )
-from .history import iso_to_snap
 from .deriva_model import DerivaModel
 from .database_model import DatabaseModel
 from .dataset_aux_classes import (
@@ -74,13 +73,20 @@ class Dataset:

     _Logger = logging.getLogger("deriva_ml")

-    def __init__(
+    def __init__(
+        self,
+        model: DerivaModel,
+        cache_dir: Path,
+        working_dir: Path,
+        use_minid: bool = True,
+    ):
         self._model = model
         self._ml_schema = ML_SCHEMA
         self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
         self._cache_dir = cache_dir
         self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
+        self._use_minid = use_minid

     def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
         try:
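The hunk above adds a `use_minid` flag to the `Dataset` mixin, which the rest of this diff uses to switch between MINID/S3-backed bags and direct catalog export. A minimal sketch of exercising the new constructor, assuming the import paths shown and purely hypothetical cache and working directories (in practice `Dataset` is mixed into `DerivaML` rather than constructed directly):

```python
from pathlib import Path

from deriva_ml.deriva_model import DerivaModel  # assumed import location
from deriva_ml.dataset import Dataset


def make_dataset_api(model: DerivaModel) -> Dataset:
    # use_minid=False skips MINID registration and S3 upload and instead
    # downloads bags directly from the catalog export service.
    return Dataset(
        model=model,
        cache_dir=Path.home() / ".deriva-ml" / "cache",   # hypothetical path
        working_dir=Path.home() / ".deriva-ml" / "work",  # hypothetical path
        use_minid=False,
    )
```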
@@ -100,27 +106,28 @@ class Dataset:
         dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
-    ) ->
+    ) -> None:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-
+        # determine snapshot after changes were made
+        snap = self._model.catalog.get("/").json()["snaptime"]
         # Construct version records for insert
-        version_records = [
-
-
-
-
-
-
-
-
+        version_records = schema_path.tables["Dataset_Version"].insert(
+            [
+                {
+                    "Dataset": dataset.rid,
+                    "Version": str(dataset.version),
+                    "Description": description,
+                    "Execution": execution_rid,
+                    "Snapshot": snap,
+                }
+                for dataset in dataset_list
+            ]
+        )

-        #
-
-            {"Version": v["RID"], "RID": v["Dataset"]}
-
-        ]
-        schema_path.tables["Dataset"].update(version_rids)
-        return version_rids
+        # And update the dataset records.
+        schema_path.tables["Dataset"].update(
+            [{"Version": v["RID"], "RID": v["Dataset"]} for v in version_records]
+        )

     def _bootstrap_versions(self):
         datasets = [ds["RID"] for ds in self.find_datasets()]
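With this change a catalog snapshot id (`snaptime`) is captured when versions are created, stored on each `Dataset_Version` row, and the `Dataset` rows are then pointed at their newest version. A minimal sketch of the same datapath pattern written directly against deriva-py; the host, catalog id, RID, and version values are hypothetical:

```python
from deriva.core import ErmrestCatalog, get_credential

host = "example.derivacloud.org"  # hypothetical host
catalog = ErmrestCatalog("https", host, "1", credentials=get_credential(host))

# Snapshot id of the catalog after the membership changes were made.
snap = catalog.get("/").json()["snaptime"]

pb = catalog.getPathBuilder()
version_records = pb.schemas["deriva-ml"].tables["Dataset_Version"].insert(
    [{"Dataset": "1-abc0", "Version": "1.1.0", "Description": "demo", "Snapshot": snap}]
)
# Point each Dataset row at the RID of its newly created version record.
pb.schemas["deriva-ml"].tables["Dataset"].update(
    [{"RID": r["Dataset"], "Version": r["RID"]} for r in version_records]
)
```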
@@ -170,6 +177,9 @@ class Dataset:
         Returns:
             A list of DatasetHistory objects which indicate the version-number, creation time, and bag instantiation of the dataset.
         """
+
+        if not self._is_dataset_rid(dataset_rid):
+            raise DerivaMLException(f"RID is not for a data set: {dataset_rid}")
         version_path = (
             self._model.catalog.getPathBuilder()
             .schemas[self._ml_schema]
@@ -179,7 +189,7 @@ class Dataset:
             DatasetHistory(
                 dataset_version=DatasetVersion.parse(v["Version"]),
                 minid=v["Minid"],
-
+                snapshot=v["Snapshot"],
                 dataset_rid=dataset_rid,
                 version_rid=v["RID"],
                 description=v["Description"],
@@ -240,7 +250,7 @@ class Dataset:

         Args:
             dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            component: Which version of the dataset_table to increment. Major, Minor, or Patch
             description: Description of the version update of the dataset_table.
             execution_rid: Which execution is performing increment.

@@ -248,7 +258,7 @@ class Dataset:
             new semantic version of the dataset_table as a 3-tuple

         Raises:
-            DerivaMLException: if provided RID is not to a dataset_table.
+            DerivaMLException: if provided, RID is not to a dataset_table.
         """

         # Find all the datasets that are reachable from this dataset and determine their new version numbers.
@@ -268,7 +278,7 @@ class Dataset:
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
         self,
-
+        dataset_types: str | list[str],
         description: str,
         execution_rid: Optional[RID] = None,
         version: Optional[DatasetVersion] = None,
@@ -276,7 +286,7 @@ class Dataset:
         """Create a new dataset_table from the specified list of RIDs.

         Args:
-
+            dataset_types: One or more dataset_table types. Must be a term from the DatasetType controlled vocabulary.
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
@@ -304,7 +314,7 @@ class Dataset:
             return False

         # Create the entry for the new dataset_table and get its RID.
-        ds_types = [
+        ds_types = [dataset_types] if isinstance(dataset_types, str) else dataset_types
         pb = self._model.catalog.getPathBuilder()
         for ds_type in ds_types:
             if not check_dataset_type(ds_type):
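`create_dataset` now normalizes its first argument, so a single vocabulary term or a list of terms both work. A usage sketch, assuming `ml` is a `DerivaML` instance and the type names are made-up terms from the DatasetType vocabulary:

```python
# Single type as a plain string.
training_rid = ml.create_dataset("Training", description="Training split")

# Several types at once; the string form above is simply wrapped into a list.
image_rid = ml.create_dataset(
    ["Training", "Image"],
    description="Training images",
)
```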
@@ -452,7 +462,9 @@ class Dataset:
         )

         # self.model = self.catalog.getCatalogModel()
-        self.dataset_table.annotations.update(
+        self.dataset_table.annotations.update(
+            self._generate_dataset_download_annotations()
+        )
         self._model.model.apply()
         return table

@@ -464,7 +476,7 @@ class Dataset:

         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            recurse:
+            recurse: (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.

         Returns:
@@ -530,8 +542,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             validate: Check rid_list to make sure elements are not already in the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this dataset.
@@ -544,7 +556,7 @@ class Dataset:

         Args:
             member_rid:
-            path:
+            path: (Default value = None)

         Returns:

@@ -570,7 +582,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -618,8 +630,8 @@ class Dataset:
         dataset is incremented and the description, if provide is applied to that new version.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
-            members: List of RIDs
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
+            members: List of member RIDs to add to the dataset_table.
             description: Markdown description of the updated dataset.
             execution_rid: Optional RID of execution associated with this operation.
         """
@@ -634,7 +646,7 @@ class Dataset:
             a.other_fkeys.pop().pk_table.name: a.table.name
             for a in self.dataset_table.find_associations()
         }
-        # Get a list of all the types
+        # Get a list of all the object types that can be linked to a dataset_table.
         for m in members:
             try:
                 rid_info = self._model.catalog.resolve_rid(m)
@@ -670,7 +682,7 @@ class Dataset:
         )

     @validate_call
-    def list_dataset_parents(self, dataset_rid: RID) -> list[
+    def list_dataset_parents(self, dataset_rid: RID) -> list[str]:
         """Given a dataset_table RID, return a list of RIDs of the parent datasets if this is included in a
         nested dataset.

@@ -696,14 +708,14 @@ class Dataset:

     @validate_call
     def list_dataset_children(self, dataset_rid: RID, recurse=False) -> list[RID]:
-        """Given a dataset_table RID, return a list of RIDs
+        """Given a dataset_table RID, return a list of RIDs for any nested datasets.

         Args:
             dataset_rid: A dataset_table RID.
-            recurse: If True, return a list of
+            recurse: If True, return a list of nested datasets RIDs.

         Returns:
-            list of
+            list of nested dataset RIDs.

         """
         dataset_dataset_path = (
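The clarified docstrings above describe nested-dataset navigation. A short sketch using the signatures shown, where `ml` is assumed to be a `DerivaML` instance and the RID is hypothetical:

```python
dataset_rid = "1-ab3c"  # hypothetical dataset RID

parents = ml.list_dataset_parents(dataset_rid)              # datasets that contain this one
children = ml.list_dataset_children(dataset_rid)            # directly nested datasets
descendants = ml.list_dataset_children(dataset_rid, recurse=True)

print(f"{dataset_rid}: {len(parents)} parent(s), {len(descendants)} nested dataset(s) overall")
```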
@@ -726,7 +738,7 @@ class Dataset:

         return find_children(dataset_rid)

-    def
+    def _export_vocabulary(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
     ) -> list[dict[str, Any]]:
         """
@@ -756,10 +768,10 @@ class Dataset:
     ) -> Iterator[tuple[str, str, Table]]:
         paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

-        def source_path(path: tuple[Table, ...]):
+        def source_path(path: tuple[Table, ...]) -> list[str]:
             """Convert a tuple representing a path into a source path component with FK linkage"""
             path = list(path)
-            p = [f"{self._model.ml_schema}:Dataset/RID={{
+            p = [f"{self._model.ml_schema}:Dataset/RID={{RID}}"]
             for table in path[1:]:
                 if table.name == "Dataset_Dataset":
                     p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
@@ -803,7 +815,7 @@ class Dataset:
         dataset_elements = [
             snapshot_catalog._model.name_to_table(e)
             for e, m in snapshot_catalog.list_dataset_members(
-                dataset_rid=dataset_rid,  # limit=1
+                dataset_rid=dataset_rid,  # limit=1 Limit seems to make things run slow.
             ).items()
             if m
         ]
@@ -857,7 +869,7 @@ class Dataset:
         """

         def children_depth(
-            dataset_rid: RID, nested_datasets: dict[
+            dataset_rid: RID, nested_datasets: dict[str, list[str]]
         ) -> int:
             """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
@@ -899,13 +911,13 @@ class Dataset:
     def _dataset_specification(
         self,
         writer: Callable[[str, str, Table], list[dict[str, Any]]],
-        dataset: DatasetSpec,
+        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
-        The top level data directory of the resulting BDBag will have one subdirectory for element type.
+        The top level data directory of the resulting BDBag will have one subdirectory for element type. The subdirectory
         will contain the CSV indicating which elements of that type are present in the dataset_table, and then there will be a
-
+        subdirectory for each object that is reachable from the dataset_table members.

         To simplify reconstructing the relationship between tables, the CVS for each
         The top level data directory will also contain a subdirectory for any controlled vocabularies used in the dataset_table.
@@ -913,7 +925,7 @@ class Dataset:

         For example, consider a dataset_table that consists of two element types, T1 and T2. T1 has foreign key relationships to
         objects in tables T3 and T4. There are also two controlled vocabularies, CV1 and CV2. T2 is an asset table
-        which has two
+        which has two assets in it.  The layout of the resulting bdbag would be:
             data
                 CV1/
                     cv1.csv
@@ -939,12 +951,12 @@ class Dataset:
         Returns:
             A dataset_table specification.
         """
-        element_spec =
+        element_spec = self._export_vocabulary(writer)
         for path in self._table_paths(
             dataset=dataset, snapshot_catalog=snapshot_catalog
         ):
             element_spec.extend(writer(*path))
-        return
+        return element_spec

     def _download_dataset_bag(
         self,
@@ -985,7 +997,7 @@ class Dataset:
             for h in self.dataset_history(dataset_rid=dataset.rid)
             if h.dataset_version == dataset.version
         ][0]
-        return f"{self._model.catalog.catalog_id}@{
+        return f"{self._model.catalog.catalog_id}@{version_record.snapshot}"

     def _create_dataset_minid(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
@@ -1000,7 +1012,7 @@ class Dataset:
         )
         try:
             self._logger.info(
-                f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
+                f"Downloading dataset {'minid' if self._use_minid else 'bag'} for catalog: {dataset.rid}@{str(dataset.version)}"
             )
             # Generate the bag and put into S3 storage.
             exporter = DerivaExport(
@@ -1009,9 +1021,10 @@ class Dataset:
                 output_dir=tmp_dir,
                 defer_download=True,
                 timeout=(10, 610),
-                envars={"
+                envars={"RID": dataset.rid},
             )
             minid_page_url = exporter.export()[0]  # Get the MINID launch page
+
         except (
             DerivaDownloadError,
             DerivaDownloadConfigurationError,
@@ -1021,17 +1034,18 @@ class Dataset:
         ) as e:
             raise DerivaMLException(format_exception(e))
         # Update version table with MINID.
-
-
-
-
-
-
-
-
-
-
-
+        if self._use_minid:
+            version_path = (
+                self._model.catalog.getPathBuilder()
+                .schemas[self._ml_schema]
+                .tables["Dataset_Version"]
+            )
+            version_rid = [
+                h
+                for h in self.dataset_history(dataset_rid=dataset.rid)
+                if h.dataset_version == dataset.version
+            ][0].version_rid
+            version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
         return minid_page_url

     def _get_dataset_minid(
@@ -1074,14 +1088,25 @@ class Dataset:
                 raise DerivaMLException(
                     f"Minid for dataset {dataset.rid} doesn't exist"
                 )
-            self.
+            if self._use_minid:
+                self._logger.info("Creating new MINID for dataset %s", dataset.rid)
             minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
         # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
-
-
+        if self._use_minid:
+            r = requests.get(minid_url, headers={"accept": "application/json"})
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version, **r.json()
+            )
+        else:
+            dataset_minid = DatasetMinid(
+                dataset_version=dataset.version,
+                RID=f"{dataset.rid}@{dataset_version_record.snapshot}",
+                location=minid_url,
+            )
+        return dataset_minid

     def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
-        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
+        """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it, and validate
         that all the metadata is correct

         Args:
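In `_get_dataset_minid`, the MINID branch resolves the landing-page URL returned by the export service into JSON metadata that is then unpacked into `DatasetMinid`. A minimal sketch of that lookup; the URL is hypothetical and the exact metadata fields depend on the identifier service:

```python
import requests

# minid_url would be the landing page returned by DerivaExport.export() above.
minid_url = "https://identifiers.fair-research.org/hdl:20.500.12633/abc123"  # hypothetical
response = requests.get(minid_url, headers={"accept": "application/json"})
response.raise_for_status()

# In the code above this payload feeds DatasetMinid(dataset_version=..., **response.json()).
minid_metadata = response.json()
print(sorted(minid_metadata.keys()))
```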
@@ -1090,19 +1115,37 @@ class Dataset:
             the location of the unpacked and validated dataset_table bag and the RID of the bag and the bag MINID
         """

-        # Check to see if we have an existing idempotent materialization of the desired bag.  If so, then
+        # Check to see if we have an existing idempotent materialization of the desired bag.  If so, then reuse
         # it.  If not, then we need to extract the contents of the archive into our cache directory.
         bag_dir = self._cache_dir / f"{minid.dataset_rid}_{minid.checksum}"
         if bag_dir.exists():
-
-
-
-
-
-
-
-
-
+            self._logger.info(
+                f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
+            return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+
+        # Either bag hasn't been downloaded yet, or we are not using a Minid, so we don't know the checksum yet.
+        with TemporaryDirectory() as tmp_dir:
+            if self._use_minid:
+                # Get bag from S3
+                archive_path = fetch_single_file(minid.bag_url)
+            else:
+                exporter = DerivaExport(
+                    host=self._model.catalog.deriva_server.server, output_dir=tmp_dir
+                )
+                archive_path = exporter.retrieve_file(minid.bag_url)
+            hashes = hash_utils.compute_file_hashes(
+                archive_path, hashes=["md5", "sha256"]
+            )
+            checksum = hashes["sha256"][0]
+            bag_dir = self._cache_dir / f"{minid.dataset_rid}_{checksum}"
+            if bag_dir.exists():
+                self._logger.info(
+                    f"Using cached bag for {minid.dataset_rid} Version:{minid.dataset_version}"
+                )
+                return Path(bag_dir / f"Dataset_{minid.dataset_rid}")
+            bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
+            bdb.validate_bag_structure(bag_path)
         return Path(bag_path)

     def _materialize_dataset_bag(
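The rewritten `_download_dataset_minid` keys its cache on the bag archive's checksum, so the same dataset version is only ever extracted once. A condensed sketch of that idea under the same assumptions (the cache layout and names are illustrative, not the library's API):

```python
from pathlib import Path

import deriva.core.utils.hash_utils as hash_utils
from bdbag import bdbag_api as bdb


def cached_extract(archive_path: str, cache_dir: Path, dataset_rid: str) -> Path:
    # The cache directory is keyed by the archive's SHA-256, mirroring the logic above.
    checksum = hash_utils.compute_file_hashes(archive_path, hashes=["sha256"])["sha256"][0]
    bag_dir = cache_dir / f"{dataset_rid}_{checksum}"
    if not bag_dir.exists():
        bag_path = bdb.extract_bag(archive_path, bag_dir.as_posix())
        bdb.validate_bag_structure(bag_path)  # structural check only; fetch happens later
    return bag_dir / f"Dataset_{dataset_rid}"
```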
@@ -1154,6 +1197,9 @@ class Dataset:

         # If this bag has already been validated, our work is done.  Otherwise, materialize the bag.
         if not validated_check.exists():
+            self._logger.info(
+                f"Materializing bag {minid.dataset_rid} Version:{minid.dataset_version}"
+            )
             bdb.materialize(
                 bag_path.as_posix(),
                 fetch_callback=fetch_progress_callback,
@@ -1162,9 +1208,8 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)

-    def
+    def _export_annotation(
         self,
-        dataset: Optional[DatasetSpec] = None,
         snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
@@ -1173,19 +1218,6 @@ class Dataset:
             An export specification suitable for Chaise.
         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath: list[Table]:
-                dpath: list[Table]:
-                table: Table
-
-            Returns:
-                An export specification suitable for Chaise.
-            """
-            return self._export_dataset_element(spath, dpath, table)
-
         # Export specification is a specification for the datasets, plus any controlled vocabulary
         return [
             {
@@ -1204,41 +1236,34 @@ class Dataset:
                 "destination": {"type": "json", "name": "schema"},
             },
         ] + self._dataset_specification(
-
+            self._export_annotation_dataset_element,
+            None,
+            snapshot_catalog=snapshot_catalog,
         )

-    def
+    def _export_specification(
         self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> list[dict[str, Any]]:
         """
+        Generate a specification for export engine for specific dataset.
+
         Returns:
             a download specification for the datasets in the provided model.

         """

-        def writer(spath: str, dpath: str, table: Table) -> list[dict[str, Any]]:
-            """
-
-            Args:
-                spath:
-                dpath:
-                table: Table
-
-            Returns:
-
-            """
-            return self._download_dataset_element(spath, dpath, table)
-
         # Download spec is the spec for any controlled vocabulary and for the dataset_table.
         return [
             {
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(
+        ] + self._dataset_specification(
+            self._export_specification_dataset_element, dataset, snapshot_catalog
+        )

     @staticmethod
-    def
+    def _export_specification_dataset_element(
         spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Return the download specification for the data object indicated by a path through the data model.
@@ -1255,7 +1280,7 @@ class Dataset:
             {
                 "processor": "csv",
                 "processor_params": {
-                    "query_path": f"/entity/{spath}
+                    "query_path": f"/entity/{spath}",
                     "output_path": dpath,
                 },
             }
@@ -1268,16 +1293,15 @@ class Dataset:
                 {
                     "processor": "fetch",
                     "processor_params": {
-                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5
+                        "query_path": f"/attribute/{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                         "output_path": f"asset/{table.name}",
                     },
                 }
             )
         return exports

-
-
-        spath: str, dpath: str, table: Table
+    def _export_annotation_dataset_element(
+        self, spath: str, dpath: str, table: Table
     ) -> list[dict[str, Any]]:
         """Given a path in the data model, output an export specification for the path taken to get to the current table.

@@ -1293,9 +1317,23 @@ class Dataset:
         # into a path in the form of /S:T1/S:T2/S:Table
         # Generate the destination path in the file system using just the table names.

+        skip_root_path = False
+        if spath.startswith(f"{self._ml_schema}:Dataset/"):
+            # Chaise will add table name and RID filter, so strip it off.
+            spath = "/".join(spath.split("/")[2:])
+            if spath == "":
+                # This path is to just the dataset table.
+                return []
+        else:
+            # A vocabulary table, so we don't want the root_path.
+            skip_root_path = True
         exports = [
             {
-                "source": {
+                "source": {
+                    "api": "entity",
+                    "path": spath,
+                    "skip_root_path": skip_root_path,
+                },
                 "destination": {"name": dpath, "type": "csv"},
             }
         ]
@@ -1306,6 +1344,7 @@ class Dataset:
             exports.append(
                 {
                     "source": {
+                        "skip_root_path": False,
                         "api": "attribute",
                         "path": f"{spath}/!(URL::null::)/url:=URL,length:=Length,filename:=Filename,md5:=MD5",
                     },
@@ -1315,44 +1354,53 @@ class Dataset:
         return exports

     def _generate_dataset_download_spec(
-        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
     ) -> dict[str, Any]:
         """
+        Generate a specification for downloading a specific dataset.

+        This routine creates a download specification that can be used by the Deriva export processor to download
+        a specific dataset as a MINID.
         Returns:
         """
         s3_target = "s3://eye-ai-shared"
         minid_test = False

         catalog_id = self._version_snapshot(dataset)
-
-
+        post_processors = (
+            {
+                "post_processors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": s3_target,
+                        },
+                    },
+                    {
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": minid_test,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
+                            },
+                        },
+                    },
+                ]
+            }
+            if self._use_minid
+            else {}
+        )
+        return post_processors | {
+            "env": {"RID": "{RID}"},
             "bag": {
-                "bag_name": "Dataset_{
+                "bag_name": "Dataset_{RID}",
                 "bag_algorithms": ["md5"],
                 "bag_archiver": "zip",
                 "bag_metadata": {},
                 "bag_idempotent": True,
             },
-            "post_processors": [
-                {
-                    "processor": "cloud_upload",
-                    "processor_params": {
-                        "acl": "public-read",
-                        "target_url": s3_target,
-                    },
-                },
-                {
-                    "processor": "identifier",
-                    "processor_params": {
-                        "test": minid_test,
-                        "env_column_map": {
-                            "Dataset_RID": "{RID}@{snaptime}",
-                            "Description": "{Description}",
-                        },
-                    },
-                },
-            ],
             "catalog": {
                 "host": f"{self._model.catalog.deriva_server.scheme}://{self._model.catalog.deriva_server.server}",
                 "catalog_id": catalog_id,
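As reorganized above, the MINID-specific `post_processors` section is now merged in only when `use_minid` is set. An abridged illustration of the two shapes the generated specification can take (values shortened and the `catalog` section elided):

```python
spec_with_minid = {
    "post_processors": [
        {
            "processor": "cloud_upload",
            "processor_params": {"acl": "public-read", "target_url": "s3://eye-ai-shared"},
        },
        {
            "processor": "identifier",
            "processor_params": {
                "test": False,
                "env_column_map": {"RID": "{RID}@{snaptime}", "Description": "{Description}"},
            },
        },
    ],
    "env": {"RID": "{RID}"},
    "bag": {
        "bag_name": "Dataset_{RID}",
        "bag_algorithms": ["md5"],
        "bag_archiver": "zip",
        "bag_metadata": {},
        "bag_idempotent": True,
    },
    # "catalog": {...}  # host, versioned catalog_id, and query processors as above
}

# With use_minid=False the post-processing section is simply absent.
spec_without_minid = {k: v for k, v in spec_with_minid.items() if k != "post_processors"}
```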
@@ -1368,125 +1416,50 @@ class Dataset:
                         {
                             "processor": "env",
                             "processor_params": {
-                                "query_path": "/entity/M:=deriva-ml:Dataset/RID={
+                                "query_path": "/entity/M:=deriva-ml:Dataset/RID={RID}",
                                 "output_path": "Dataset",
                                 "query_keys": ["RID", "Description"],
                             },
                         },
                     ]
-                    + self.
+                    + self._export_specification(dataset, snapshot_catalog),
             },
         }

-    def
-
-
-
-
-
-
-
-
-
-
-
-
-            "*": [
-                "RID",
-                "Description",
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
-                    },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "detailed": [
-                "RID",
-                "Description",
-                {
-                    "source": [
-                        {"inbound": ["deriva-ml", "Dataset_Dataset_Type_Dataset_fkey"]},
-                        {
-                            "outbound": [
-                                "deriva-ml",
-                                "Dataset_Dataset_Type_Dataset_Type_fkey",
-                            ]
+    def _generate_dataset_download_annotations(self) -> dict[str, Any]:
+        post_processors = (
+            {
+                "type": "BAG",
+                "outputs": [{"fragment_key": "dataset_export_outputs"}],
+                "displayname": "BDBag to Cloud",
+                "bag_idempotent": True,
+                "postprocessors": [
+                    {
+                        "processor": "cloud_upload",
+                        "processor_params": {
+                            "acl": "public-read",
+                            "target_url": "s3://eye-ai-shared/",
                         },
-                        "RID",
-                    ],
-                    "markdown_name": "Dataset Types",
-                },
-                {
-                    "display": {
-                        "markdown_pattern": "[Annotate Dataset](https://www.eye-ai.org/apps/grading-interface/main?dataset_rid={{{RID}}}){: .btn}"
                     },
-                    "markdown_name": "Annotation App",
-                },
-                rcb_name,
-                rmb_name,
-            ],
-            "filter": {
-                "and": [
-                    {"source": "RID"},
-                    {"source": "Description"},
                     {
-                        "
-
-
-
-
-
-                        },
-                        {
-                            "outbound": [
-                                "deriva-ml",
-                                "Dataset_Dataset_Type_Dataset_Type_fkey",
-                            ]
+                        "processor": "identifier",
+                        "processor_params": {
+                            "test": False,
+                            "env_column_map": {
+                                "RID": "{RID}@{snaptime}",
+                                "Description": "{Description}",
                             },
-
-                        ],
-                        "markdown_name": "Dataset Types",
-                    },
-                    {
-                        "source": [{"outbound": rcb_name}, "RID"],
-                        "markdown_name": "Created By",
-                    },
-                    {
-                        "source": [{"outbound": rmb_name}, "RID"],
-                        "markdown_name": "Modified By",
+                        },
                     },
-                ]
-            },
-        }
-
-    def _dataset_visible_fkeys(self) -> dict[str, Any]:
-        def fkey_name(fk):
-            return [fk.name[0].name, fk.name[1]]
-
-        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
-
-        source_list = [
-            {
-                "source": [
-                    {"inbound": fkey_name(fkey.self_fkey)},
-                    {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
-                    "RID",
                 ],
-                "markdown_name": other_fkey.pk_table.name,
             }
-
-
-
-
-    def _generate_dataset_annotations(self) -> dict[str, Any]:
+            if self._use_minid
+            else {}
+        )
         return {
             deriva_tags.export_fragment_definitions: {
-                "dataset_export_outputs": self.
+                "dataset_export_outputs": self._export_annotation()
             },
-            deriva_tags.visible_columns: self.dataset_visible_columns(),
             deriva_tags.visible_foreign_keys: self._dataset_visible_fkeys(),
             deriva_tags.export_2019: {
                 "detailed": {
@@ -1496,45 +1469,56 @@ class Dataset:
                             "outputs": [{"fragment_key": "dataset_export_outputs"}],
                             "displayname": "BDBag Download",
                             "bag_idempotent": True,
-
-
-                                "processor": "identifier",
-                                "processor_params": {
-                                    "test": False,
-                                    "env_column_map": {
-                                        "Dataset_RID": "{RID}@{snaptime}",
-                                        "Description": "{Description}",
-                                    },
-                                },
-                            }
-                        ],
-                    },
-                    {
-                        "type": "BAG",
-                        "outputs": [{"fragment_key": "dataset_export_outputs"}],
-                        "displayname": "BDBag to Cloud",
-                        "bag_idempotent": True,
-                        "postprocessors": [
-                            {
-                                "processor": "cloud_upload",
-                                "processor_params": {
-                                    "acl": "public-read",
-                                    "target_url": "s3://eye-ai-shared/",
-                                },
-                            },
-                            {
-                                "processor": "identifier",
-                                "processor_params": {
-                                    "test": False,
-                                    "env_column_map": {
-                                        "Dataset_RID": "{RID}@{snaptime}",
-                                        "Description": "{Description}",
-                                    },
-                                },
-                            },
-                        ],
-                    },
+                        }
+                        | post_processors
                     ]
                 }
             },
         }
+
+    def _dataset_visible_fkeys(self) -> dict[str, Any]:
+        def fkey_name(fk):
+            return [fk.name[0].name, fk.name[1]]
+
+        dataset_table = self._model.schemas["deriva-ml"].tables["Dataset"]
+
+        source_list = [
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Version_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Previous Versions",
+                "entity": True,
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Parent Datasets",
+            },
+            {
+                "source": [
+                    {"inbound": ["deriva-ml", "Dataset_Dataset_Dataset_fkey"]},
+                    {"outbound": ["deriva-ml", "Dataset_Dataset_Nested_Dataset_fkey"]},
+                    "RID",
+                ],
+                "markdown_name": "Child Datasets",
+            },
+        ]
+        source_list.extend(
+            [
+                {
+                    "source": [
+                        {"inbound": fkey_name(fkey.self_fkey)},
+                        {"outbound": fkey_name(other_fkey := fkey.other_fkeys.pop())},
+                        "RID",
+                    ],
+                    "markdown_name": other_fkey.pk_table.name,
+                }
+                for fkey in dataset_table.find_associations(max_arity=3, pure=False)
+            ]
+        )
+        return {"detailed": source_list}