deriva-ml 1.8.1__py3-none-any.whl → 1.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/VERSION.py +1 -1
- deriva_ml/dataset.py +204 -77
- deriva_ml/deriva_definitions.py +0 -1
- deriva_ml/deriva_ml_base.py +182 -30
- deriva_ml/deriva_ml_execute.py +104 -0
- deriva_ml/deriva_model.py +4 -1
- deriva_ml/execution.py +45 -4
- deriva_ml/execution_configuration.py +5 -5
- deriva_ml/upload.py +4 -1
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/METADATA +4 -2
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/RECORD +15 -14
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/WHEEL +1 -1
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/LICENSE +0 -0
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/top_level.txt +0 -0
deriva_ml/VERSION.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.8.1"
+__version__ = "1.8.4"
deriva_ml/dataset.py
CHANGED
@@ -6,6 +6,7 @@ accessible via a DerivaML class instance.
 
 """
 
+from __future__ import annotations
 from bdbag.fetch.fetcher import fetch_single_file
 from bdbag import bdbag_api as bdb
 from collections import defaultdict
@@ -37,7 +38,7 @@ from pydantic import (
 import requests
 
 from tempfile import TemporaryDirectory, NamedTemporaryFile
-from typing import Any, Callable, Optional, Iterable, Iterator
+from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
 
 from deriva_ml import DatasetBag
 from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -52,6 +53,9 @@ from .dataset_aux_classes import (
     DatasetSpec,
 )
 
+if TYPE_CHECKING:
+    from .deriva_ml_base import DerivaML
+
 
 class Dataset:
     """
@@ -83,29 +87,32 @@ class Dataset:
         else:
             return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
 
-    def …
+    def _insert_dataset_versions(
         self,
-        …
-        dataset_version: DatasetVersion,
+        dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
     ) -> RID:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
-        … (15 removed lines not shown)
+
+        # Construct version records for insert
+        version_records = [
+            {
+                "Dataset": dataset.rid,
+                "Version": str(dataset.version),
+                "Description": description,
+                "Execution": execution_rid,
+            }
+            for dataset in dataset_list
+        ]
+
+        # Insert version records and construct entities for updating the dataset version column.
+        version_rids = [
+            {"Version": v["RID"], "RID": v["Dataset"]}
+            for v in schema_path.tables["Dataset_Version"].insert(version_records)
+        ]
+        schema_path.tables["Dataset"].update(version_rids)
+        return version_rids
 
     def _bootstrap_versions(self):
         datasets = [ds["RID"] for ds in self.find_datasets()]
@@ -237,16 +244,20 @@ class Dataset:
         Raises:
             DerivaMLException: if provided RID is not to a dataset_table.
         """
-        … (7 removed lines not shown)
-            execution_rid=execution_rid,
+
+        # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
+        related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
+        version_update_list = [
+            DatasetSpec(
+                rid=ds_rid,
+                version=self.dataset_version(ds_rid).increment_version(component),
             )
-        …
+            for ds_rid in related_datasets
+        ]
+        updated_versions = self._insert_dataset_versions(
+            version_update_list, description=description, execution_rid=execution_rid
+        )
+        return [d.version for d in version_update_list if d.rid == dataset_rid][0]
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def create_dataset(
@@ -323,9 +334,8 @@ class Dataset:
         pb.schemas[self._ml_schema].Dataset_Execution.insert(
             [{"Dataset": dataset_rid, "Execution": execution_rid}]
         )
-        self.…
-            dataset_rid,
-            dataset_version=version,
+        self._insert_dataset_versions(
+            [DatasetSpec(rid=dataset_rid, version=version)],
             execution_rid=execution_rid,
             description="Initial dataset creation.",
         )
@@ -447,7 +457,7 @@ class Dataset:
 
     # @validate_call
     def list_dataset_members(
-        self, dataset_rid: RID, recurse: bool = False
+        self, dataset_rid: RID, recurse: bool = False, limit: Optional[int] = None
    ) -> dict[str, list[dict[str, Any]]]:
         """Return a list of entities associated with a specific dataset_table.
 
@@ -455,6 +465,7 @@ class Dataset:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
             dataset_rid: RID:
             recurse: (Default value = False)
+            limit: If provided, the maximum number of members to return for each element type.
 
         Returns:
             Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -492,7 +503,9 @@ class Dataset:
                     target_path,
                     on=(member_path.columns[member_column] == target_path.columns["RID"]),
                 )
-            target_entities = list(…
+            target_entities = list(
+                path.entities().fetch(limit=limit) if limit else path.entities().fetch()
+            )
             members[target_table.name].extend(target_entities)
             if recurse and target_table == self.dataset_table:
                 # Get the members for all the nested datasets and add to the member list.
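The `limit` argument added to `list_dataset_members` is what the new `_collect_paths` helper further down relies on, passing `limit=1` to check which element types of a dataset are actually populated. A minimal usage sketch; the host name, catalog id, and dataset RID below are placeholders rather than values taken from this diff:

```python
from deriva_ml import DerivaML

# Placeholder host, catalog id, and dataset RID; real values will differ.
ml = DerivaML("example.derivacloud.org", 1)
members = ml.list_dataset_members(dataset_rid="1-ABCD", limit=1)

# Element types that have at least one member; fetching with limit=1 keeps this cheap.
populated = [table for table, rows in members.items() if rows]
print(populated)
```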
@@ -694,11 +707,25 @@ class Dataset:
             list of RIDs of nested datasets.
 
         """
-        … (5 removed lines not shown)
+        dataset_dataset_path = (
+            self._model.catalog.getPathBuilder()
+            .schemas[self._ml_schema]
+            .tables["Dataset_Dataset"]
+        )
+        nested_datasets = list(dataset_dataset_path.entities().fetch())
+
+        def find_children(rid: RID):
+            children = [
+                child["Nested_Dataset"]
+                for child in nested_datasets
+                if child["Dataset"] == rid
+            ]
+            if recurse:
+                for child in children.copy():
+                    children.extend(find_children(child))
+            return children
+
+        return find_children(dataset_rid)
 
     def _vocabulary_specification(
         self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
@@ -723,26 +750,20 @@ class Dataset:
             for o in writer(f"{table.schema.name}:{table.name}", table.name, table)
         ]
 
-    def _table_paths(…
+    def _table_paths(
+        self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
+    ) -> Iterator[tuple[str, str, Table]]:
 
-        …
-        paths = self._model._schema_to_paths()
-        nested_paths = paths
+        paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
 
-        … (3 removed lines not shown)
-        nested_paths = [
-            [self.dataset_table, dataset_dataset] + p for p in nested_paths
-        ]
-        paths.extend(nested_paths)
-
-        def source_path(path):
+        def source_path(path: tuple[Table, ...]):
+            """Convert a tuple representing a path into a source path component with FK linkage"""
+            path = list(path)
             p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
             for table in path[1:]:
-                if table == …
+                if table.name == "Dataset_Dataset":
                     p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
-                elif table == …
+                elif table.name == "Dataset":
                     p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
                 elif table.name == "Dataset_Version":
                     p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
@@ -753,10 +774,81 @@ class Dataset:
         src_paths = ["/".join(source_path(p)) for p in paths]
         dest_paths = ["/".join([t.name for t in p]) for p in paths]
         target_tables = [p[-1] for p in paths]
-
         return zip(src_paths, dest_paths, target_tables)
 
-    def …
+    def _collect_paths(
+        self,
+        dataset_rid: Optional[RID] = None,
+        snapshot_catalog: Optional[DerivaML] = None,
+        dataset_nesting_depth: Optional[int] = None,
+    ) -> set[tuple[Table, ...]]:
+
+        snapshot_catalog = snapshot_catalog or self
+        dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
+            "Dataset"
+        ]
+        dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
+            "Dataset_Dataset"
+        ]
+        dataset_associations = [
+            a
+            for a in self.dataset_table.find_associations()
+            if a.table.schema.name != self._ml_schema
+            or a.table.name == "Dataset_Dataset"
+        ]
+        if dataset_rid:
+            # Get a list of the members of the dataset so we can figure out which tables to query.
+            dataset_elements = [
+                snapshot_catalog._model.name_to_table(e)
+                for e, m in snapshot_catalog.list_dataset_members(
+                    dataset_rid=dataset_rid, limit=1
+                ).items()
+                if m
+            ]
+            included_associations = [
+                a.table
+                for a in dataset_table.find_associations()
+                if a.other_fkeys.pop().pk_table in dataset_elements
+            ]
+        else:
+            included_associations = dataset_associations
+        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        paths = {
+            tuple(p)
+            for p in snapshot_catalog._model._schema_to_paths()
+            if (len(p) == 1)
+            or (p[1] not in dataset_associations)  # Tables in the domain schema
+            or (
+                p[1] in included_associations
+            )  # Tables that include members of the dataset
+        }
+        # Now get paths for nested datasets
+        nested_paths = set()
+        if dataset_rid:
+            for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
+                nested_paths |= self._collect_paths(
+                    c, snapshot_catalog=snapshot_catalog
+                )
+        else:
+            # Initialize nesting depth if not already provided.
+            dataset_nesting_depth = (
+                self._dataset_nesting_depth()
+                if dataset_nesting_depth is None
+                else dataset_nesting_depth
+            )
+            if dataset_nesting_depth:
+                nested_paths = self._collect_paths(
+                    dataset_nesting_depth=dataset_nesting_depth - 1
+                )
+        if nested_paths:
+            paths |= {
+                tuple([dataset_table]),
+                (dataset_table, dataset_dataset),
+            }
+            paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
+        return paths
+
+    def _dataset_nesting_depth(self, dataset_rid: Optional[RID] = None) -> int:
         """Determine the maximum dataset nesting depth in the current catalog.
 
         Returns:
@@ -766,7 +858,7 @@ class Dataset:
         def children_depth(
             dataset_rid: RID, nested_datasets: dict[RID, list[RID]]
         ) -> int:
-            """Return the number of nested datasets in the current catalog"""
+            """Return the number of nested datasets for the dataset_rid if provided, otherwise in the current catalog"""
             try:
                 children = nested_datasets[dataset_rid]
                 return (
@@ -783,8 +875,19 @@ class Dataset:
             .schemas[self._ml_schema]
             .tables["Dataset_Dataset"]
         )
+        dataset_children = (
+            [
+                {
+                    "Dataset": dataset_rid,
+                    "Nested_Dataset": c,
+                }  # Make uniform with return from datapath
+                for c in self.list_dataset_children(dataset_rid)
+            ]
+            if dataset_rid
+            else pb.entities().fetch()
+        )
         nested_dataset = defaultdict(list)
-        for ds in …
+        for ds in dataset_children:
             nested_dataset[ds["Dataset"]].append(ds["Nested_Dataset"])
         return (
             max(map(lambda d: children_depth(d, dict(nested_dataset)), nested_dataset))
@@ -793,7 +896,10 @@ class Dataset:
         )
 
     def _dataset_specification(
-        self, …
+        self,
+        writer: Callable[[str, str, Table], list[dict[str, Any]]],
+        dataset: DatasetSpec,
+        snapshot_catalog: Optional[DerivaML] = None,
     ) -> list[dict[str, Any]]:
         """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
         The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -833,21 +939,24 @@ class Dataset:
             A dataset_table specification.
         """
         element_spec = []
-        for path in self._table_paths(…
+        for path in self._table_paths(
+            dataset=dataset, snapshot_catalog=snapshot_catalog
+        ):
             element_spec.extend(writer(*path))
         return self._vocabulary_specification(writer) + element_spec
 
-
-    def download_dataset_bag(
+    def _download_dataset_bag(
         self,
         dataset: DatasetSpec,
         execution_rid: Optional[RID] = None,
+        snapshot_catalog: Optional[DerivaML] = None,
     ) -> DatasetBag:
         """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
 
         Args:
             dataset: Specification of the dataset to be downloaded.
             execution_rid: Execution RID for the dataset.
+            snapshot_catalog: Snapshot catalog for the dataset version if specified.
 
         Returns:
             Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
@@ -858,16 +967,17 @@ class Dataset:
             and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
         ):
             raise DerivaMLException(f"RID {execution_rid} is not an execution")
-        minid = self.…
+        minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)
 
         bag_path = (
             self._materialize_dataset_bag(minid, execution_rid=execution_rid)
             if dataset.materialize
-            else self.…
+            else self._download_dataset_minid(minid)
         )
         return DatabaseModel(minid, bag_path).get_dataset()
 
     def _version_snapshot(self, dataset: DatasetSpec) -> str:
+        """Return a catalog with snapshot for the specified dataset version"""
         version_record = [
             h
             for h in self.dataset_history(dataset_rid=dataset.rid)
@@ -875,13 +985,17 @@ class Dataset:
         ][0]
         return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
 
-    def _create_dataset_minid(…
+    def _create_dataset_minid(
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+    ) -> str:
         with TemporaryDirectory() as tmp_dir:
             # Generate a download specification file for the current catalog schema. By default, this spec
             # will generate a minid and place the bag into S3 storage.
             spec_file = f"{tmp_dir}/download_spec.json"
             with open(spec_file, "w", encoding="utf-8") as ds:
-                json.dump(…
+                json.dump(
+                    self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
+                )
             try:
                 self._logger.info(
                     f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
@@ -918,14 +1032,17 @@ class Dataset:
             version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
         return minid_page_url
 
-        … (3 removed lines not shown)
+    def _get_dataset_minid(
+        self,
+        dataset: DatasetSpec,
+        snapshot_catalog: Optional[DerivaML] = None,
+        create: bool = True,
     ) -> DatasetMinid:
         """Return a MINID to the specified dataset. If no version is specified, use the latest.
 
         Args:
             dataset: Specification of the dataset.
+            snapshot_catalog: Snapshot catalog for the dataset version if specified.
             create: Create a new MINID if one doesn't already exist.
 
         Returns:
@@ -956,12 +1073,12 @@ class Dataset:
                     f"Minid for dataset {dataset.rid} doesn't exist"
                 )
             self._logger.info("Creating new MINID for dataset %s", dataset.rid)
-            minid_url = self._create_dataset_minid(dataset)
+            minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
         # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
         r = requests.get(minid_url, headers={"accept": "application/json"})
         return DatasetMinid(dataset_version=dataset.version, **r.json())
 
-    def …
+    def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
         """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
         that all the metadata is correct
 
@@ -1028,7 +1145,7 @@ class Dataset:
             return True
 
         # request metadata
-        bag_path = self.…
+        bag_path = self._download_dataset_minid(minid)
         bag_dir = bag_path.parent
         validated_check = bag_dir / "validated_check.txt"
 
@@ -1042,7 +1159,11 @@ class Dataset:
             validated_check.touch()
         return Path(bag_path)
 
-    def _export_outputs(…
+    def _export_outputs(
+        self,
+        dataset: Optional[DatasetSpec] = None,
+        snapshot_catalog: Optional[DerivaML] = None,
+    ) -> list[dict[str, Any]]:
         """Return and output specification for the datasets in the provided model
 
         Returns:
@@ -1079,9 +1200,13 @@ class Dataset:
                 "source": {"api": "schema", "skip_root_path": True},
                 "destination": {"type": "json", "name": "schema"},
             },
-        ] + self._dataset_specification(…
+        ] + self._dataset_specification(
+            writer, dataset, snapshot_catalog=snapshot_catalog
+        )
 
-    def _processor_params(…
+    def _processor_params(
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+    ) -> list[dict[str, Any]]:
         """
         Returns:
             a download specification for the datasets in the provided model.
@@ -1107,7 +1232,7 @@ class Dataset:
                 "processor": "json",
                 "processor_params": {"query_path": "/schema", "output_path": "schema"},
             }
-        ] + self._dataset_specification(writer)
+        ] + self._dataset_specification(writer, dataset, snapshot_catalog)
 
     @staticmethod
     def _download_dataset_element(
@@ -1186,7 +1311,9 @@ class Dataset:
         )
         return exports
 
-    def _generate_dataset_download_spec(…
+    def _generate_dataset_download_spec(
+        self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+    ) -> dict[str, Any]:
         """
 
         Returns:
@@ -1244,7 +1371,7 @@ class Dataset:
                 },
             },
         ]
-        + self._processor_params(),
+        + self._processor_params(dataset, snapshot_catalog),
             },
         }
deriva_ml/deriva_definitions.py
CHANGED
deriva_ml/deriva_ml_base.py
CHANGED
@@ -15,8 +15,11 @@ import logging
 from datetime import datetime
 import hashlib
 from itertools import chain
+import inspect
 from pathlib import Path
 import requests
+from setuptools_git_versioning import get_latest_file_commit
+import subprocess
 from typing import Optional, Any, Iterable, TYPE_CHECKING
 from deriva.core import (
     ErmrestCatalog,
@@ -35,6 +38,8 @@ from pydantic import validate_call, ConfigDict
 from .execution_configuration import ExecutionConfiguration, Workflow
 from .feature import Feature, FeatureRecord
 from .dataset import Dataset
+from .dataset_aux_classes import DatasetSpec
+from .dataset_bag import DatasetBag
 from .deriva_model import DerivaModel
 from .upload import (
     table_path,
@@ -56,6 +61,18 @@ from .deriva_definitions import (
     FileSpec,
 )
 
+try:
+    from icecream import ic
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
+
+
+try:
+    from IPython import get_ipython
+except ImportError:  # Graceful fallback if IPython isn't installed.
+    get_ipython = lambda: None
+
+
 if TYPE_CHECKING:
     from .execution import Execution
 
@@ -132,6 +149,17 @@ class DerivaML(Dataset):
         self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
+        self._notebook = None
+        try:
+            from IPython import get_ipython
+
+            ipython = get_ipython()
+            # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+            if ipython is not None and "IPKernelApp" in ipython.config:
+                self._notebook = Path(ipython.user_ns.get("__session__"))
+            # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+        except (ImportError, AttributeError):
+            pass
 
         self.domain_schema = self.model.domain_schema
         self.project_name = project_name or self.domain_schema
@@ -705,6 +733,28 @@ class DerivaML(Dataset):
             for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
         ]
 
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def download_dataset_bag(
+        self,
+        dataset: DatasetSpec,
+        execution_rid: Optional[RID] = None,
+    ) -> DatasetBag:
+        """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
+
+        Args:
+            dataset: Specification of the dataset to be downloaded.
+            execution_rid: Execution RID for the dataset.
+
+        Returns:
+            Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
+            for the dataset.
+        """
+        return self._download_dataset_bag(
+            dataset=dataset,
+            execution_rid=execution_rid,
+            snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
+        )
+
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
         """Download an asset from a URL and place it in a local directory.
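The public `download_dataset_bag` now lives on `DerivaML` and wraps the private `Dataset._download_dataset_bag`, pinning a second `DerivaML` instance to the catalog snapshot for the requested dataset version. A hedged usage sketch; the host, catalog id, and dataset RID are placeholders:

```python
from deriva_ml import DerivaML, DatasetSpec

# Placeholder host, catalog id, and dataset RID.
ml = DerivaML("example.derivacloud.org", 1)
spec = DatasetSpec(rid="1-ABCD", version=ml.dataset_version("1-ABCD"))

# Downloads (and, if needed, mints a MINID for) the requested dataset version.
bag = ml.download_dataset_bag(spec)
```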
@@ -808,8 +858,10 @@ class DerivaML(Dataset):
             Iterable of the RIDs of the files that were added.
         """
         defined_types = self.list_vocabulary_terms(MLVocab.file_type)
-        if execution_rid and self.resolve_rid(execution_rid).table.name != …
-            raise DerivaMLException(…
+        if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
+            raise DerivaMLException(
+                f"RID {execution_rid} is not for an execution table."
+            )
 
         def check_file_type(dtype: str) -> bool:
             for term in defined_types:
@@ -862,18 +914,11 @@ class DerivaML(Dataset):
         self, file_types: Optional[list[str]] = None
     ) -> list[dict[str, Any]]:
         """Return the contents of the file table. Denormalized file types into the file record."""
-        atable = next(
-            self._model.schemas[self._ml_schema]
-            .tables[MLVocab.dataset_type]
-            .find_associations()
-        ).name
         ml_path = self.pathBuilder.schemas[self._ml_schema]
-        atable_path = ml_path.tables[atable]
         file_path = ml_path.File
         type_path = ml_path.File_File_Type
 
         # Get a list of all the dataset_type values associated with this dataset_table.
-        files = []
         path = file_path.link(type_path)
         path = path.attributes(
             path.File.RID,
@@ -885,10 +930,12 @@ class DerivaML(Dataset):
         )
         file_map = {}
         for f in path.fetch():
-            file_map.setdefault(f[…
+            file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
+                f["File_Type"]
+            )
 
         # Now get rid of the File_Type key and return the result
-        return […
+        return [(f, f.pop("File_Type"))[0] for f in file_map.values()]
 
     def list_workflows(self) -> list[Workflow]:
         """Return a list of all the workflows in the catalog."""
@@ -901,6 +948,7 @@ class DerivaML(Dataset):
                 version=w["Version"],
                 description=w["Description"],
                 rid=w["RID"],
+                checksum=w["Checksum"],
             )
             for w in workflow_path.entities().fetch()
         ]
@@ -917,33 +965,18 @@ class DerivaML(Dataset):
         """
 
         # Check to make sure that the workflow is not already in the table. If it's not, add it.
-        … (3 removed lines not shown)
-            response = requests.get(url)
-            response.raise_for_status()
-        except Exception:
-            raise DerivaMLException(f"Invalid URL: {url}")
-        else:
-            sha256_hash = hashlib.sha256()
-            sha256_hash.update(response.content)
-            checksum = "SHA-256: " + sha256_hash.hexdigest()
-            return checksum
+
+        if workflow_rid := self.lookup_workflow(workflow.url):
+            return workflow_rid
 
         ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
         try:
-            url_column = ml_schema_path.Workflow.URL
-            workflow_record = list(
-                ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
-            )[0]
-            workflow_rid = workflow_record["RID"]
-        except IndexError:
             # Record doesn't exist already
             workflow_record = {
                 "URL": workflow.url,
                 "Name": workflow.name,
                 "Description": workflow.description,
-                "Checksum": …
+                "Checksum": workflow.checksum,
                 "Version": workflow.version,
                 MLVocab.workflow_type: self.lookup_term(
                     MLVocab.workflow_type, workflow.workflow_type
@@ -955,6 +988,125 @@ class DerivaML(Dataset):
             raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
         return workflow_rid
 
+    def lookup_workflow(self, url: str) -> Optional[RID]:
+        workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
+        try:
+            url_column = workflow_path.URL
+            return list(workflow_path.filter(url_column == url).entities())[0]["RID"]
+        except IndexError:
+            return None
+
+    def create_workflow(
+        self, name: str, workflow_type: str, description: str = "", create: bool = True
+    ) -> RID:
+        """Identify current executing program and return a workflow RID for it
+
+        Determane the notebook of script that is currently being executed. Assume that this is
+        being executed from a cloned GitHub repository. Determine the remote repository name for
+        this object. Then either retrieve an existing workflow for this executable of create
+        a new one.
+
+        Args:
+            name: The name of the workflow.
+            workflow_type: The type of the workflow.
+            description: The description of the workflow.
+            create: Whether or not to create a new workflow.
+        """
+        # Make sure type is correct.
+        self.lookup_term(MLVocab.workflow_type, workflow_type)
+        filename, github_url, is_dirty = self._github_url()
+
+        if is_dirty:
+            self._logger.warning(
+                f"File {filename} has been modified since last commit. Consider commiting before executing"
+            )
+
+        sha256_hash = hashlib.sha256()
+        if self._notebook:
+            # If you are in a notebook, strip out the outputs before computing the checksum.
+            result = subprocess.run(
+                ["nbstripout", "-t", filename],
+                capture_output=True,
+                text=False,
+                check=True,
+            )
+            sha256_hash.update(result.stdout)
+        else:
+            with open(filename, "rb") as f:
+                sha256_hash.update(f.read())
+        checksum = "SHA-256:" + sha256_hash.hexdigest()
+
+        workflow = Workflow(
+            name=name,
+            url=github_url,
+            checksum=checksum,
+            description=description,
+            workflow_type=workflow_type,
+        )
+        return self.add_workflow(workflow) if create else None
+
+    def _github_url(self) -> tuple[str, str, bool]:
+        """Return a GitHUB URL for the latest commit of the script from which this routine is called.
+
+        This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
+        the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
+        file in GitHUB.
+
+        Returns: A tuple with the filename, gethub_url and a boolaen to indicated if uncommited changes
+            have been made to the file.
+
+        """
+
+        # Get the name of the script that is calling this function.
+        if self._notebook:
+            # Try to get the __session__ variable from the user namespace.
+            filename = Path("").absolute().parent / self._notebook
+        else:
+            stack = inspect.stack()
+            if len(stack) > 1:
+                filename = Path(
+                    stack[2].filename
+                )  # Get the caller's filename, which is two up the stack from here.
+            else:
+                raise DerivaMLException(
+                    f"Looking for caller failed"
+                )  # Stack is too shallow
+
+        # Get repo URL from local github repo.
+        try:
+            result = subprocess.run(
+                ["git", "remote", "get-url", "origin"], capture_output=True, text=True
+            )
+            github_url = result.stdout.strip().removesuffix(".git")
+        except subprocess.CalledProcessError:
+            raise DerivaMLException(f"No GIT remote found")
+
+        # Find the root directory for the repository
+        repo_root = filename
+        while repo_root != repo_root.root:
+            if (repo_root / ".git").exists():
+                break
+            else:
+                repo_root = repo_root.parent
+
+        # Now check to see if file has been modified since the last commit.
+        try:
+            result = subprocess.run(
+                ["git", "status", "--porcelain"],
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            is_dirty = bool(
+                " M " in result.stdout.strip()
+            )  # Returns True if output indicates a modified file
+        except subprocess.CalledProcessError:
+            is_dirty = False  # If Git command fails, assume no changes
+
+        sha = get_latest_file_commit(filename)
+        url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
+        return filename, url, is_dirty
+
     # @validate_call
     def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
         """Create an execution object
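The new `create_workflow` helper identifies the script or notebook that is currently running, checksums it (via `nbstripout` for notebooks), derives its GitHub URL from the local clone, and then reuses or inserts a `Workflow` record. A hedged sketch; the host, catalog id, and the workflow type term are illustrative assumptions, and the type must already exist in the workflow-type vocabulary:

```python
from deriva_ml import DerivaML

# Placeholder host and catalog id; "python_script" is an assumed vocabulary term.
ml = DerivaML("example.derivacloud.org", 1)
workflow_rid = ml.create_workflow(
    name="train-model",
    workflow_type="python_script",
    description="Training run driven by a committed script.",
)
print(workflow_rid)
```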
deriva_ml/deriva_ml_execute.py
ADDED
@@ -0,0 +1,104 @@
+from sympy import cxxcode
+
+from deriva_ml import DerivaML, execution_configuration
+
+def execute(host, catalog, script):
+    workflow_rid = foobar
+    execution_configuration = cxxcode(
+
+    )
+    ml_instance = DerivaML()
+    ml_instance.create_execution(configuration)
+    script
+
+
+from deriva_ml import DerivaML, ExecutionConfiguration, DatasetSpec, RID, DerivaMLException
+import os
+import sys
+import json
+import traceback
+import argparse
+import requests
+from requests.exceptions import HTTPError, ConnectionError
+from deriva.transfer import GenericDownloader
+from deriva.transfer.download import DerivaDownloadError, DerivaDownloadConfigurationError, \
+    DerivaDownloadAuthenticationError, DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, \
+    DerivaDownloadBaggingError
+from deriva.core import BaseCLI, KeyValuePairArgs, format_credential, format_exception, urlparse
+
+
+class DerivaMLExecCLI(BaseCLI):
+    def __init__(self, description, epilog, **kwargs):
+
+        BaseCLI.__init__(self, description, epilog, **kwargs)
+        self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
+        self.parser.add_argument("--timeout", metavar="<seconds>",
+                                 help="Total number of seconds elapsed before the download is aborted.")
+        self.parser.add_argument("output_dir", metavar="<output dir>", help="Path to an output directory.")
+        self.parser.add_argument("envars", metavar="[key=value key=value ...]",
+                                 nargs=argparse.REMAINDER, action=KeyValuePairArgs, default={},
+                                 help="Variable length of whitespace-delimited key=value pair arguments used for "
+                                      "string interpolation in specific parts of the configuration file. "
+                                      "For example: key1=value1 key2=value2")
+
+    def main(self):
+        try:
+            args = self.parse_cli()
+        except ValueError as e:
+            sys.stderr.write(str(e))
+            return 2
+        if not args.quiet:
+            sys.stderr.write("\n")
+
+        try:
+            try:
+                ml_instance = DerivaML(args.hostname, args.catalog)
+                downloaded = self.execute()
+                sys.stdout.write("\n%s\n" % (json.dumps(downloaded)))
+            except ConnectionError as e:
+                raise DerivaDownloadError("Connection error occurred. %s" % format_exception(e))
+            except HTTPError as e:
+                if e.response.status_code == requests.codes.unauthorized:
+                    raise DerivaDownloadAuthenticationError(
+                        "The requested service requires authentication and a valid login session could "
+                        "not be found for the specified host. Server responded: %s" % e)
+                elif e.response.status_code == requests.codes.forbidden:
+                    raise DerivaDownloadAuthorizationError(
+                        "A requested operation was forbidden. Server responded: %s" % e)
+        except (DerivaDownloadError, DerivaDownloadConfigurationError, DerivaDownloadAuthenticationError,
+                DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, DerivaDownloadBaggingError) as e:
+            sys.stderr.write(("\n" if not args.quiet else "") + format_exception(e))
+            if args.debug:
+                traceback.print_exc()
+            return 1
+        except:
+            sys.stderr.write("An unexpected error occurred.")
+            traceback.print_exc()
+            return 1
+        finally:
+            if not args.quiet:
+                sys.stderr.write("\n\n")
+        return 0
+
+
+def do_stuff():
+    pass
+
+def main(datasets: list[RID], model: list[RID], hostname: str, catalog_id: str):
+    my_url = DerivaML.github_url()
+    ml_instance = DerivaML(hostname, catalog_id)
+    ml_instance.lookup_workflow(my_url)
+    config = ExecutionConfiguration(
+        datasets=[DatasetSpec(rid=dataset,
+                              version=ml_instance.dataset_version(dataset)) for dataset in datasets],
+        assets=model,
+        workflow= ml_instance.lookup_workflow(my_url)
+    )
+    execution = ml_instance.create_execution(config)
+    with execution as e:
+        do_stuff()
+        execution.upload_execution_outputs()
+
+if __name__ == "__main__":
+    main(datasets, model, hostname, catalog_id)
+if __file__ == matplotlib_inline
deriva_ml/deriva_model.py
CHANGED
@@ -265,7 +265,9 @@ class DerivaModel:
         return relationships[0]
 
     def _schema_to_paths(
-        self, …
+        self,
+        root: Table = None,
+        path: list[Table] = None,
     ) -> list[list[Table]]:
         """Recursively walk over the domain schema graph and extend the current path.
 
@@ -278,6 +280,7 @@ class DerivaModel:
             A list of all the paths through the graph. Each path is a list of tables.
 
         """
+
         root = root or self.model.schemas[self.ml_schema].tables["Dataset"]
         path = path.copy() if path else []
         parent = path[-1] if path else None  # Table that we are coming from.
deriva_ml/execution.py
CHANGED
@@ -12,6 +12,7 @@ import os
 import shutil
 from datetime import datetime
 from pathlib import Path
+import requests
 from tempfile import NamedTemporaryFile
 from typing import Iterable, Any, Optional
 from deriva.core import format_exception
@@ -28,7 +29,6 @@ from .deriva_definitions import (
 )
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
-from .dataset import Dataset
 from .dataset_bag import DatasetBag
 from .execution_configuration import ExecutionConfiguration
 from .execution_environment import get_execution_environment
@@ -51,6 +51,12 @@ except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 
 
+try:
+    from jupyter_server.serverapp import list_running_servers
+except ImportError:
+    list_running_servers = lambda: []
+
+
 class Execution:
     """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
     computational, manual processes can be represented by an execution as well.
@@ -100,6 +106,7 @@ class Execution:
         self.configuration = configuration
         self._ml_object = ml_object
         self.start_time = None
+        self.stop_time = None
         self.status = Status.created
         self.uploaded_assets: list[Path] = []
 
@@ -221,8 +228,9 @@ class Execution:
         Returns:
             the location of the unpacked and validated dataset_table bag and the RID of the bag
         """
-        … (2 removed lines not shown)
+        return self._ml_object.download_dataset_bag(
+            dataset, execution_rid=self.execution_rid
+        )
 
     @validate_call
     def update_status(self, status: Status, msg: str) -> None:
@@ -243,6 +251,35 @@ class Execution:
             ]
         )
 
+    def _create_notebook_checkpoint(self):
+        """Trigger a checkpoint creation using Jupyter's API."""
+        notebook_name = self._ml_object._notebook
+        servers = list_running_servers()
+        # Look for the server running this notebook.
+        root = Path("").absolute().parent.as_posix()
+        servers = list(list_running_servers())
+        # Jupyterhub seems to handle root_dir differently then server case.
+        server = (
+            servers
+            if len(servers) == 1
+            else [s for s in servers if s["root_dir"] == root]
+        )[0]
+        notebook_url = f"{server['url']}api/contents/{notebook_name}"
+
+        # Get notebook content
+        response = requests.get(
+            notebook_url, headers={"Authorization": f"Token {server['token']}"}
+        )
+        if response.status_code == 200:
+            notebook_content = response.json()["content"]
+            # Execution metadata cannot be in a directory, so map path into filename.
+            checkpoint_path = (
+                self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
+                / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
+            )
+            with open(checkpoint_path, "w", encoding="utf-8") as f:
+                json.dump(notebook_content, f)
+
     def execution_start(self) -> None:
         """Start an execution, uploading status to catalog"""
 
@@ -252,11 +289,15 @@ class Execution:
 
     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
-        …
+        self.stop_time = datetime.now()
+        duration = self.stop_time - self.start_time
         hours, remainder = divmod(duration.total_seconds(), 3600)
         minutes, seconds = divmod(remainder, 60)
         duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
 
+        if self._ml_object._notebook:
+            self._create_notebook_checkpoint()
+
         self.update_status(Status.completed, "Algorithm execution ended.")
         self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
             [{"RID": self.execution_rid, "Duration": duration}]
deriva_ml/execution_configuration.py
CHANGED
@@ -33,18 +33,18 @@ class Workflow(BaseModel):
     version: Optional[str] = None
     description: Optional[str] = ""
     rid: Optional[RID] = None
+    checksum: Optional[str]
+
 
 
 class ExecutionConfiguration(BaseModel):
     """Define the parameters that are used to configure a specific execution.
 
     Attributes:
-        datasets: List of …
-        …
-            needed, a dictionary that defines the rid and the materialization parameter for the
-            download_dataset_bag method can be specified, e.g. datasets=[{'rid': RID, 'materialize': True}].
+        datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
+            should be materialized.
         assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
-        workflow: A workflow instance. Must have a name, URI to the workflow instance, and a type.
+        workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
         description: A description of the execution. Can use Markdown format.
     """
 
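Putting the clarified `ExecutionConfiguration` attributes together with the execution flow shown in `deriva_ml_execute.py` above, a hedged end-to-end sketch; the host, all RIDs, and the workflow URL are placeholders:

```python
from deriva_ml import DerivaML, ExecutionConfiguration, DatasetSpec

ml = DerivaML("example.derivacloud.org", 1)            # placeholder host / catalog
config = ExecutionConfiguration(
    datasets=[DatasetSpec(rid="1-ABCD", version=ml.dataset_version("1-ABCD"))],
    assets=["1-WXYZ"],                                  # RIDs in an asset table
    workflow=ml.lookup_workflow("https://github.com/org/repo/blob/abc123/train.py"),
    description="Example execution against placeholder RIDs.",
)

execution = ml.create_execution(config)
with execution as e:
    # ... do the actual work here ...
    execution.upload_execution_outputs()
```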
deriva_ml/upload.py
CHANGED
@@ -70,8 +70,11 @@ exec_asset_regex = (
 exec_metadata_dir_regex = (
     exec_dir_regex + r"/execution-metadata/(?P<execution_metadata_type>[-\w]+)"
 )
+
+# May have more than one suffix
 exec_metadata_regex = (
-    exec_metadata_dir_regex…
+    exec_metadata_dir_regex
+    + r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
 )
 feature_dir_regex = exec_dir_regex + r"/feature"
 feature_table_dir_regex = (
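The comment above notes that execution-metadata files may carry more than one suffix; the updated tail of `exec_metadata_regex` captures everything up to the last dot as the filename. A quick, self-contained check of just that tail (the execution-metadata directory prefix from `upload.py` is omitted here):

```python
import re

# Only the filename/extension tail of exec_metadata_regex, copied from the diff above.
tail = re.compile(r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$")

for name in ("/environment.txt", "/notebook.ipynb.checkpoint", "/runtime-env.json"):
    m = tail.search(name)
    print(name, "->", m.groupdict() if m else "no match")
```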
{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/METADATA
CHANGED
@@ -1,16 +1,18 @@
 Metadata-Version: 2.2
 Name: deriva-ml
-Version: 1.8.1
+Version: 1.8.4
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: deriva~=1.7.…
+Requires-Dist: deriva~=1.7.7
 Requires-Dist: pandas
 Requires-Dist: regex~=2024.7.24
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: semver>3.0.0
+Requires-Dist: setuptools-git-versioning<3,>=2.0
+Requires-Dist: nbstripout
 
 Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
 using a deriva catalog.
{deriva_ml-1.8.1.dist-info → deriva_ml-1.8.4.dist-info}/RECORD
CHANGED
@@ -1,20 +1,21 @@
-deriva_ml/VERSION.py,sha256=…
+deriva_ml/VERSION.py,sha256=8kdJa8mgK7VES73y02oBbzwoXZCUs42GzbJ4UU-L_3I,22
 deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
 deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
-deriva_ml/dataset.py,sha256=…
+deriva_ml/dataset.py,sha256=xC6QPUp4MZcJiEnOEU3NnzoLBL9RcJWtPTyzIQP0Ivw,60666
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
 deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
-deriva_ml/deriva_definitions.py,sha256=…
-deriva_ml/deriva_ml_base.py,sha256=…
-deriva_ml/…
-deriva_ml/…
-deriva_ml/…
+deriva_ml/deriva_definitions.py,sha256=pZLPoUxiuJ-uGglmQ6sF9oVXsSUuOnPEqywoec78XNM,8893
+deriva_ml/deriva_ml_base.py,sha256=3iA1OaPU-6Q7ixt87uDmPuHHZ5P-FyHvX0AKfi4tKp0,42224
+deriva_ml/deriva_ml_execute.py,sha256=y_rGjc97eidBuzy-AaQGe93vuTbWbkNkK9rpReqV0IY,4433
+deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
+deriva_ml/execution.py,sha256=c7dbk4HvEh7E4BLlBrf_azUxxhRSUmLQa_6G8t8OKVY,29929
+deriva_ml/execution_configuration.py,sha256=bjnZwXN6M7YPy5dFQwoGEBU8YjhQRSe1FW0rL0V9TaM,3422
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
 deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
-deriva_ml/upload.py,sha256=…
+deriva_ml/upload.py,sha256=CKtT-gBln3pnAll9TFaiPhFSHC-bzg9oE4ruh_OSOqY,22270
 deriva_ml/build/lib/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deriva_ml/build/lib/schema_setup/alter_annotation.py,sha256=pkwk0WystN69JfAFK4iBJZAZVQKbRs-gN9IFYuS9rfg,1739
 deriva_ml/build/lib/schema_setup/annotation_temp.py,sha256=Euygu8wNklZFUbR6mz-pDWJemlzdsIn9d6j0f6fCfgE,9102
@@ -26,9 +27,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
 deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.8.1.dist-info/…
-deriva_ml-1.8.1.dist-info/…
-deriva_ml-1.8.1.dist-info/…
-deriva_ml-1.8.1.dist-info/…
-deriva_ml-1.8.1.dist-info/…
-deriva_ml-1.8.1.dist-info/…
+deriva_ml-1.8.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.8.4.dist-info/METADATA,sha256=F14U7NvY310NBB4wGp3-OVmAUXvMy_sDNuS1ZmRjwek,631
+deriva_ml-1.8.4.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+deriva_ml-1.8.4.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.8.4.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.8.4.dist-info/RECORD,,
File without changes
File without changes
File without changes