deriva-ml 1.8.2__tar.gz → 1.8.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deriva_ml-1.8.2/src/deriva_ml.egg-info → deriva_ml-1.8.4}/PKG-INFO +4 -2
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/pyproject.toml +4 -2
- deriva_ml-1.8.4/src/deriva_ml/VERSION.py +1 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/dataset.py +149 -93
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/deriva_definitions.py +0 -1
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/deriva_ml_base.py +182 -30
- deriva_ml-1.8.4/src/deriva_ml/deriva_ml_execute.py +104 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/execution.py +45 -4
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/execution_configuration.py +5 -5
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/upload.py +4 -1
- {deriva_ml-1.8.2 → deriva_ml-1.8.4/src/deriva_ml.egg-info}/PKG-INFO +4 -2
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml.egg-info/SOURCES.txt +1 -0
- deriva_ml-1.8.4/src/deriva_ml.egg-info/requires.txt +7 -0
- deriva_ml-1.8.2/src/deriva_ml/VERSION.py +0 -1
- deriva_ml-1.8.2/src/deriva_ml.egg-info/requires.txt +0 -5
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/LICENSE +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/README.md +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/setup.cfg +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/__init__.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/database_model.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/dataset_aux_classes.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/dataset_bag.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/demo_catalog.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/deriva_model.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/execution_environment.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/feature.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/history.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/__init__.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/annotations.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/create_schema.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/policy.json +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/test_functions.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml.egg-info/entry_points.txt +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml.egg-info/top_level.txt +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_basic_tables.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_dataset.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_download.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_execution.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_features.py +0 -0
- {deriva_ml-1.8.2 → deriva_ml-1.8.4}/tests/test_upload.py +0 -0
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.8.
|
|
3
|
+
Version: 1.8.4
|
|
4
4
|
Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist: deriva~=1.7.
|
|
9
|
+
Requires-Dist: deriva~=1.7.7
|
|
10
10
|
Requires-Dist: pandas
|
|
11
11
|
Requires-Dist: regex~=2024.7.24
|
|
12
12
|
Requires-Dist: pydantic>=2.10.6
|
|
13
13
|
Requires-Dist: semver>3.0.0
|
|
14
|
+
Requires-Dist: setuptools-git-versioning<3,>=2.0
|
|
15
|
+
Requires-Dist: nbstripout
|
|
14
16
|
|
|
15
17
|
Deriva-ML is a Python library to simplify the process of creating and executing reproducible machine learning workflows
|
|
16
18
|
using a deriva catalog.
|
|
@@ -13,11 +13,13 @@ description = "Utilities to simplify use of Dervia and Pandas to create reproduc
|
|
|
13
13
|
readme = "README.md"
|
|
14
14
|
requires-python = ">=3.10"
|
|
15
15
|
dependencies = [
|
|
16
|
-
"deriva~=1.7.
|
|
16
|
+
"deriva~=1.7.7",
|
|
17
17
|
"pandas",
|
|
18
18
|
"regex~=2024.7.24",
|
|
19
19
|
"pydantic>=2.10.6",
|
|
20
|
-
"semver>3.0.0"
|
|
20
|
+
"semver>3.0.0",
|
|
21
|
+
"setuptools-git-versioning>=2.0,<3",
|
|
22
|
+
"nbstripout",
|
|
21
23
|
]
|
|
22
24
|
|
|
23
25
|
[tool.setuptools.package-data]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.8.4"
|
|
@@ -6,6 +6,7 @@ accessible via a DerivaML class instance.
|
|
|
6
6
|
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from __future__ import annotations
|
|
9
10
|
from bdbag.fetch.fetcher import fetch_single_file
|
|
10
11
|
from bdbag import bdbag_api as bdb
|
|
11
12
|
from collections import defaultdict
|
|
@@ -37,7 +38,7 @@ from pydantic import (
|
|
|
37
38
|
import requests
|
|
38
39
|
|
|
39
40
|
from tempfile import TemporaryDirectory, NamedTemporaryFile
|
|
40
|
-
from typing import Any, Callable, Optional, Iterable, Iterator
|
|
41
|
+
from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING
|
|
41
42
|
|
|
42
43
|
from deriva_ml import DatasetBag
|
|
43
44
|
from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
|
|
@@ -52,6 +53,9 @@ from .dataset_aux_classes import (
|
|
|
52
53
|
DatasetSpec,
|
|
53
54
|
)
|
|
54
55
|
|
|
56
|
+
if TYPE_CHECKING:
|
|
57
|
+
from .deriva_ml_base import DerivaML
|
|
58
|
+
|
|
55
59
|
|
|
56
60
|
class Dataset:
|
|
57
61
|
"""
|
|
@@ -83,29 +87,32 @@ class Dataset:
|
|
|
83
87
|
else:
|
|
84
88
|
return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]
|
|
85
89
|
|
|
86
|
-
def
|
|
90
|
+
def _insert_dataset_versions(
|
|
87
91
|
self,
|
|
88
|
-
|
|
89
|
-
dataset_version: DatasetVersion,
|
|
92
|
+
dataset_list: list[DatasetSpec],
|
|
90
93
|
description: Optional[str] = "",
|
|
91
94
|
execution_rid: Optional[RID] = None,
|
|
92
95
|
) -> RID:
|
|
93
96
|
schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
97
|
+
|
|
98
|
+
# Construct version records for insert
|
|
99
|
+
version_records = [
|
|
100
|
+
{
|
|
101
|
+
"Dataset": dataset.rid,
|
|
102
|
+
"Version": str(dataset.version),
|
|
103
|
+
"Description": description,
|
|
104
|
+
"Execution": execution_rid,
|
|
105
|
+
}
|
|
106
|
+
for dataset in dataset_list
|
|
107
|
+
]
|
|
108
|
+
|
|
109
|
+
# Insert version records and construct entities for updating the dataset version column.
|
|
110
|
+
version_rids = [
|
|
111
|
+
{"Version": v["RID"], "RID": v["Dataset"]}
|
|
112
|
+
for v in schema_path.tables["Dataset_Version"].insert(version_records)
|
|
113
|
+
]
|
|
114
|
+
schema_path.tables["Dataset"].update(version_rids)
|
|
115
|
+
return version_rids
|
|
109
116
|
|
|
110
117
|
def _bootstrap_versions(self):
|
|
111
118
|
datasets = [ds["RID"] for ds in self.find_datasets()]
|
|
@@ -237,16 +244,20 @@ class Dataset:
|
|
|
237
244
|
Raises:
|
|
238
245
|
DerivaMLException: if provided RID is not to a dataset_table.
|
|
239
246
|
"""
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
execution_rid=execution_rid,
|
|
247
|
+
|
|
248
|
+
# Find all of the datasets that are reachable from this dataset and determine their new version numbers.
|
|
249
|
+
related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
|
|
250
|
+
version_update_list = [
|
|
251
|
+
DatasetSpec(
|
|
252
|
+
rid=ds_rid,
|
|
253
|
+
version=self.dataset_version(ds_rid).increment_version(component),
|
|
248
254
|
)
|
|
249
|
-
|
|
255
|
+
for ds_rid in related_datasets
|
|
256
|
+
]
|
|
257
|
+
updated_versions = self._insert_dataset_versions(
|
|
258
|
+
version_update_list, description=description, execution_rid=execution_rid
|
|
259
|
+
)
|
|
260
|
+
return [d.version for d in version_update_list if d.rid == dataset_rid][0]
|
|
250
261
|
|
|
251
262
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
252
263
|
def create_dataset(
|
|
@@ -323,9 +334,8 @@ class Dataset:
|
|
|
323
334
|
pb.schemas[self._ml_schema].Dataset_Execution.insert(
|
|
324
335
|
[{"Dataset": dataset_rid, "Execution": execution_rid}]
|
|
325
336
|
)
|
|
326
|
-
self.
|
|
327
|
-
dataset_rid,
|
|
328
|
-
dataset_version=version,
|
|
337
|
+
self._insert_dataset_versions(
|
|
338
|
+
[DatasetSpec(rid=dataset_rid, version=version)],
|
|
329
339
|
execution_rid=execution_rid,
|
|
330
340
|
description="Initial dataset creation.",
|
|
331
341
|
)
|
|
@@ -455,7 +465,7 @@ class Dataset:
|
|
|
455
465
|
dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
|
|
456
466
|
dataset_rid: RID:
|
|
457
467
|
recurse: (Default value = False)
|
|
458
|
-
limit: If provided, the
|
|
468
|
+
limit: If provided, the maximum number of members to return for each element type.
|
|
459
469
|
|
|
460
470
|
Returns:
|
|
461
471
|
Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
|
|
@@ -697,11 +707,25 @@ class Dataset:
|
|
|
697
707
|
list of RIDs of nested datasets.
|
|
698
708
|
|
|
699
709
|
"""
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
710
|
+
dataset_dataset_path = (
|
|
711
|
+
self._model.catalog.getPathBuilder()
|
|
712
|
+
.schemas[self._ml_schema]
|
|
713
|
+
.tables["Dataset_Dataset"]
|
|
714
|
+
)
|
|
715
|
+
nested_datasets = list(dataset_dataset_path.entities().fetch())
|
|
716
|
+
|
|
717
|
+
def find_children(rid: RID):
|
|
718
|
+
children = [
|
|
719
|
+
child["Nested_Dataset"]
|
|
720
|
+
for child in nested_datasets
|
|
721
|
+
if child["Dataset"] == rid
|
|
722
|
+
]
|
|
723
|
+
if recurse:
|
|
724
|
+
for child in children.copy():
|
|
725
|
+
children.extend(find_children(child))
|
|
726
|
+
return children
|
|
727
|
+
|
|
728
|
+
return find_children(dataset_rid)
|
|
705
729
|
|
|
706
730
|
def _vocabulary_specification(
|
|
707
731
|
self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
|
|
@@ -727,20 +751,19 @@ class Dataset:
|
|
|
727
751
|
]
|
|
728
752
|
|
|
729
753
|
def _table_paths(
|
|
730
|
-
self, dataset: DatasetSpec = None
|
|
731
|
-
) -> Iterator[tuple[
|
|
754
|
+
self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
|
|
755
|
+
) -> Iterator[tuple[str, str, Table]]:
|
|
732
756
|
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
paths = self._collect_paths(dataset and dataset.rid)
|
|
757
|
+
paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
|
|
736
758
|
|
|
737
759
|
def source_path(path: tuple[Table, ...]):
|
|
760
|
+
"""Convert a tuple representing a path into a source path component with FK linkage"""
|
|
738
761
|
path = list(path)
|
|
739
762
|
p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
|
|
740
763
|
for table in path[1:]:
|
|
741
|
-
if table ==
|
|
764
|
+
if table.name == "Dataset_Dataset":
|
|
742
765
|
p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
|
|
743
|
-
elif table ==
|
|
766
|
+
elif table.name == "Dataset":
|
|
744
767
|
p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
|
|
745
768
|
elif table.name == "Dataset_Version":
|
|
746
769
|
p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
|
|
@@ -751,63 +774,76 @@ class Dataset:
|
|
|
751
774
|
src_paths = ["/".join(source_path(p)) for p in paths]
|
|
752
775
|
dest_paths = ["/".join([t.name for t in p]) for p in paths]
|
|
753
776
|
target_tables = [p[-1] for p in paths]
|
|
754
|
-
|
|
755
777
|
return zip(src_paths, dest_paths, target_tables)
|
|
756
778
|
|
|
757
779
|
def _collect_paths(
|
|
758
780
|
self,
|
|
759
781
|
dataset_rid: Optional[RID] = None,
|
|
782
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
760
783
|
dataset_nesting_depth: Optional[int] = None,
|
|
761
784
|
) -> set[tuple[Table, ...]]:
|
|
762
785
|
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
786
|
+
snapshot_catalog = snapshot_catalog or self
|
|
787
|
+
dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
788
|
+
"Dataset"
|
|
789
|
+
]
|
|
790
|
+
dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
|
|
791
|
+
"Dataset_Dataset"
|
|
792
|
+
]
|
|
793
|
+
dataset_associations = [
|
|
794
|
+
a
|
|
795
|
+
for a in self.dataset_table.find_associations()
|
|
796
|
+
if a.table.schema.name != self._ml_schema
|
|
797
|
+
or a.table.name == "Dataset_Dataset"
|
|
798
|
+
]
|
|
799
|
+
if dataset_rid:
|
|
800
|
+
# Get a list of the members of the dataset so we can figure out which tables to query.
|
|
801
|
+
dataset_elements = [
|
|
802
|
+
snapshot_catalog._model.name_to_table(e)
|
|
803
|
+
for e, m in snapshot_catalog.list_dataset_members(
|
|
775
804
|
dataset_rid=dataset_rid, limit=1
|
|
776
805
|
).items()
|
|
777
806
|
if m
|
|
778
807
|
]
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
for a in self.dataset_table.find_associations()
|
|
787
|
-
if a.other_fkeys.pop().pk_table in dataset_elements
|
|
788
|
-
]
|
|
808
|
+
included_associations = [
|
|
809
|
+
a.table
|
|
810
|
+
for a in dataset_table.find_associations()
|
|
811
|
+
if a.other_fkeys.pop().pk_table in dataset_elements
|
|
812
|
+
]
|
|
813
|
+
else:
|
|
814
|
+
included_associations = dataset_associations
|
|
789
815
|
# Get the paths through the schema and filter out all of dataset paths not used by this dataset.
|
|
790
816
|
paths = {
|
|
791
817
|
tuple(p)
|
|
792
|
-
for p in
|
|
818
|
+
for p in snapshot_catalog._model._schema_to_paths()
|
|
793
819
|
if (len(p) == 1)
|
|
794
|
-
or (p[1] not in dataset_associations)
|
|
795
|
-
or (
|
|
820
|
+
or (p[1] not in dataset_associations) # Tables in the domain schema
|
|
821
|
+
or (
|
|
822
|
+
p[1] in included_associations
|
|
823
|
+
) # Tables that include members of the dataset
|
|
796
824
|
}
|
|
797
825
|
# Now get paths for nested datasets
|
|
798
826
|
nested_paths = set()
|
|
799
827
|
if dataset_rid:
|
|
800
|
-
for c in
|
|
801
|
-
nested_paths |= self._collect_paths(
|
|
828
|
+
for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
|
|
829
|
+
nested_paths |= self._collect_paths(
|
|
830
|
+
c, snapshot_catalog=snapshot_catalog
|
|
831
|
+
)
|
|
802
832
|
else:
|
|
833
|
+
# Initialize nesting depth if not already provided.
|
|
834
|
+
dataset_nesting_depth = (
|
|
835
|
+
self._dataset_nesting_depth()
|
|
836
|
+
if dataset_nesting_depth is None
|
|
837
|
+
else dataset_nesting_depth
|
|
838
|
+
)
|
|
803
839
|
if dataset_nesting_depth:
|
|
804
840
|
nested_paths = self._collect_paths(
|
|
805
841
|
dataset_nesting_depth=dataset_nesting_depth - 1
|
|
806
842
|
)
|
|
807
843
|
if nested_paths:
|
|
808
844
|
paths |= {
|
|
809
|
-
tuple([
|
|
810
|
-
(
|
|
845
|
+
tuple([dataset_table]),
|
|
846
|
+
(dataset_table, dataset_dataset),
|
|
811
847
|
}
|
|
812
848
|
paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
|
|
813
849
|
return paths
|
|
@@ -863,6 +899,7 @@ class Dataset:
|
|
|
863
899
|
self,
|
|
864
900
|
writer: Callable[[str, str, Table], list[dict[str, Any]]],
|
|
865
901
|
dataset: DatasetSpec,
|
|
902
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
866
903
|
) -> list[dict[str, Any]]:
|
|
867
904
|
"""Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
|
|
868
905
|
The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
|
|
@@ -902,21 +939,24 @@ class Dataset:
|
|
|
902
939
|
A dataset_table specification.
|
|
903
940
|
"""
|
|
904
941
|
element_spec = []
|
|
905
|
-
for path in self._table_paths(
|
|
942
|
+
for path in self._table_paths(
|
|
943
|
+
dataset=dataset, snapshot_catalog=snapshot_catalog
|
|
944
|
+
):
|
|
906
945
|
element_spec.extend(writer(*path))
|
|
907
946
|
return self._vocabulary_specification(writer) + element_spec
|
|
908
947
|
|
|
909
|
-
|
|
910
|
-
def download_dataset_bag(
|
|
948
|
+
def _download_dataset_bag(
|
|
911
949
|
self,
|
|
912
950
|
dataset: DatasetSpec,
|
|
913
951
|
execution_rid: Optional[RID] = None,
|
|
952
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
914
953
|
) -> DatasetBag:
|
|
915
954
|
"""Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
|
|
916
955
|
|
|
917
956
|
Args:
|
|
918
957
|
dataset: Specification of the dataset to be downloaded.
|
|
919
958
|
execution_rid: Execution RID for the dataset.
|
|
959
|
+
snapshot_catalog: Snapshot catalog for the dataset version if specified.
|
|
920
960
|
|
|
921
961
|
Returns:
|
|
922
962
|
Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
|
|
@@ -927,16 +967,17 @@ class Dataset:
|
|
|
927
967
|
and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
|
|
928
968
|
):
|
|
929
969
|
raise DerivaMLException(f"RID {execution_rid} is not an execution")
|
|
930
|
-
minid = self.
|
|
970
|
+
minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)
|
|
931
971
|
|
|
932
972
|
bag_path = (
|
|
933
973
|
self._materialize_dataset_bag(minid, execution_rid=execution_rid)
|
|
934
974
|
if dataset.materialize
|
|
935
|
-
else self.
|
|
975
|
+
else self._download_dataset_minid(minid)
|
|
936
976
|
)
|
|
937
977
|
return DatabaseModel(minid, bag_path).get_dataset()
|
|
938
978
|
|
|
939
979
|
def _version_snapshot(self, dataset: DatasetSpec) -> str:
|
|
980
|
+
"""Return a catalog with snapshot for the specified dataset version"""
|
|
940
981
|
version_record = [
|
|
941
982
|
h
|
|
942
983
|
for h in self.dataset_history(dataset_rid=dataset.rid)
|
|
@@ -944,13 +985,17 @@ class Dataset:
|
|
|
944
985
|
][0]
|
|
945
986
|
return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"
|
|
946
987
|
|
|
947
|
-
def _create_dataset_minid(
|
|
988
|
+
def _create_dataset_minid(
|
|
989
|
+
self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
|
|
990
|
+
) -> str:
|
|
948
991
|
with TemporaryDirectory() as tmp_dir:
|
|
949
992
|
# Generate a download specification file for the current catalog schema. By default, this spec
|
|
950
993
|
# will generate a minid and place the bag into S3 storage.
|
|
951
994
|
spec_file = f"{tmp_dir}/download_spec.json"
|
|
952
995
|
with open(spec_file, "w", encoding="utf-8") as ds:
|
|
953
|
-
json.dump(
|
|
996
|
+
json.dump(
|
|
997
|
+
self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
|
|
998
|
+
)
|
|
954
999
|
try:
|
|
955
1000
|
self._logger.info(
|
|
956
1001
|
f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
|
|
@@ -987,14 +1032,17 @@ class Dataset:
|
|
|
987
1032
|
version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
|
|
988
1033
|
return minid_page_url
|
|
989
1034
|
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
1035
|
+
def _get_dataset_minid(
|
|
1036
|
+
self,
|
|
1037
|
+
dataset: DatasetSpec,
|
|
1038
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
1039
|
+
create: bool = True,
|
|
993
1040
|
) -> DatasetMinid:
|
|
994
1041
|
"""Return a MINID to the specified dataset. If no version is specified, use the latest.
|
|
995
1042
|
|
|
996
1043
|
Args:
|
|
997
1044
|
dataset: Specification of the dataset.
|
|
1045
|
+
snapshot_catalog: Snapshot catalog for the dataset version if specified.
|
|
998
1046
|
create: Create a new MINID if one doesn't already exist.
|
|
999
1047
|
|
|
1000
1048
|
Returns:
|
|
@@ -1025,12 +1073,12 @@ class Dataset:
|
|
|
1025
1073
|
f"Minid for dataset {dataset.rid} doesn't exist"
|
|
1026
1074
|
)
|
|
1027
1075
|
self._logger.info("Creating new MINID for dataset %s", dataset.rid)
|
|
1028
|
-
minid_url = self._create_dataset_minid(dataset)
|
|
1076
|
+
minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
|
|
1029
1077
|
# If provided a MINID, use the MINID metadata to get the checksum and download the bag.
|
|
1030
1078
|
r = requests.get(minid_url, headers={"accept": "application/json"})
|
|
1031
1079
|
return DatasetMinid(dataset_version=dataset.version, **r.json())
|
|
1032
1080
|
|
|
1033
|
-
def
|
|
1081
|
+
def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
|
|
1034
1082
|
"""Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
|
|
1035
1083
|
that all the metadata is correct
|
|
1036
1084
|
|
|
@@ -1097,7 +1145,7 @@ class Dataset:
|
|
|
1097
1145
|
return True
|
|
1098
1146
|
|
|
1099
1147
|
# request metadata
|
|
1100
|
-
bag_path = self.
|
|
1148
|
+
bag_path = self._download_dataset_minid(minid)
|
|
1101
1149
|
bag_dir = bag_path.parent
|
|
1102
1150
|
validated_check = bag_dir / "validated_check.txt"
|
|
1103
1151
|
|
|
@@ -1112,7 +1160,9 @@ class Dataset:
|
|
|
1112
1160
|
return Path(bag_path)
|
|
1113
1161
|
|
|
1114
1162
|
def _export_outputs(
|
|
1115
|
-
self,
|
|
1163
|
+
self,
|
|
1164
|
+
dataset: Optional[DatasetSpec] = None,
|
|
1165
|
+
snapshot_catalog: Optional[DerivaML] = None,
|
|
1116
1166
|
) -> list[dict[str, Any]]:
|
|
1117
1167
|
"""Return and output specification for the datasets in the provided model
|
|
1118
1168
|
|
|
@@ -1150,9 +1200,13 @@ class Dataset:
|
|
|
1150
1200
|
"source": {"api": "schema", "skip_root_path": True},
|
|
1151
1201
|
"destination": {"type": "json", "name": "schema"},
|
|
1152
1202
|
},
|
|
1153
|
-
] + self._dataset_specification(
|
|
1203
|
+
] + self._dataset_specification(
|
|
1204
|
+
writer, dataset, snapshot_catalog=snapshot_catalog
|
|
1205
|
+
)
|
|
1154
1206
|
|
|
1155
|
-
def _processor_params(
|
|
1207
|
+
def _processor_params(
|
|
1208
|
+
self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
|
|
1209
|
+
) -> list[dict[str, Any]]:
|
|
1156
1210
|
"""
|
|
1157
1211
|
Returns:
|
|
1158
1212
|
a download specification for the datasets in the provided model.
|
|
@@ -1178,7 +1232,7 @@ class Dataset:
|
|
|
1178
1232
|
"processor": "json",
|
|
1179
1233
|
"processor_params": {"query_path": "/schema", "output_path": "schema"},
|
|
1180
1234
|
}
|
|
1181
|
-
] + self._dataset_specification(writer, dataset)
|
|
1235
|
+
] + self._dataset_specification(writer, dataset, snapshot_catalog)
|
|
1182
1236
|
|
|
1183
1237
|
@staticmethod
|
|
1184
1238
|
def _download_dataset_element(
|
|
@@ -1257,7 +1311,9 @@ class Dataset:
|
|
|
1257
1311
|
)
|
|
1258
1312
|
return exports
|
|
1259
1313
|
|
|
1260
|
-
def _generate_dataset_download_spec(
|
|
1314
|
+
def _generate_dataset_download_spec(
|
|
1315
|
+
self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
|
|
1316
|
+
) -> dict[str, Any]:
|
|
1261
1317
|
"""
|
|
1262
1318
|
|
|
1263
1319
|
Returns:
|
|
@@ -1315,7 +1371,7 @@ class Dataset:
|
|
|
1315
1371
|
},
|
|
1316
1372
|
},
|
|
1317
1373
|
]
|
|
1318
|
-
+ self._processor_params(dataset),
|
|
1374
|
+
+ self._processor_params(dataset, snapshot_catalog),
|
|
1319
1375
|
},
|
|
1320
1376
|
}
|
|
1321
1377
|
|
|
@@ -15,8 +15,11 @@ import logging
|
|
|
15
15
|
from datetime import datetime
|
|
16
16
|
import hashlib
|
|
17
17
|
from itertools import chain
|
|
18
|
+
import inspect
|
|
18
19
|
from pathlib import Path
|
|
19
20
|
import requests
|
|
21
|
+
from setuptools_git_versioning import get_latest_file_commit
|
|
22
|
+
import subprocess
|
|
20
23
|
from typing import Optional, Any, Iterable, TYPE_CHECKING
|
|
21
24
|
from deriva.core import (
|
|
22
25
|
ErmrestCatalog,
|
|
@@ -35,6 +38,8 @@ from pydantic import validate_call, ConfigDict
|
|
|
35
38
|
from .execution_configuration import ExecutionConfiguration, Workflow
|
|
36
39
|
from .feature import Feature, FeatureRecord
|
|
37
40
|
from .dataset import Dataset
|
|
41
|
+
from .dataset_aux_classes import DatasetSpec
|
|
42
|
+
from .dataset_bag import DatasetBag
|
|
38
43
|
from .deriva_model import DerivaModel
|
|
39
44
|
from .upload import (
|
|
40
45
|
table_path,
|
|
@@ -56,6 +61,18 @@ from .deriva_definitions import (
|
|
|
56
61
|
FileSpec,
|
|
57
62
|
)
|
|
58
63
|
|
|
64
|
+
try:
|
|
65
|
+
from icecream import ic
|
|
66
|
+
except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
67
|
+
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
try:
|
|
71
|
+
from IPython import get_ipython
|
|
72
|
+
except ImportError: # Graceful fallback if IPython isn't installed.
|
|
73
|
+
get_ipython = lambda: None
|
|
74
|
+
|
|
75
|
+
|
|
59
76
|
if TYPE_CHECKING:
|
|
60
77
|
from .execution import Execution
|
|
61
78
|
|
|
@@ -132,6 +149,17 @@ class DerivaML(Dataset):
|
|
|
132
149
|
self.version = model_version
|
|
133
150
|
self.configuration = None
|
|
134
151
|
self._execution: Optional[Execution] = None
|
|
152
|
+
self._notebook = None
|
|
153
|
+
try:
|
|
154
|
+
from IPython import get_ipython
|
|
155
|
+
|
|
156
|
+
ipython = get_ipython()
|
|
157
|
+
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
158
|
+
if ipython is not None and "IPKernelApp" in ipython.config:
|
|
159
|
+
self._notebook = Path(ipython.user_ns.get("__session__"))
|
|
160
|
+
# Check if running in Jupyter's ZMQ kernel (used by notebooks)
|
|
161
|
+
except (ImportError, AttributeError):
|
|
162
|
+
pass
|
|
135
163
|
|
|
136
164
|
self.domain_schema = self.model.domain_schema
|
|
137
165
|
self.project_name = project_name or self.domain_schema
|
|
@@ -705,6 +733,28 @@ class DerivaML(Dataset):
|
|
|
705
733
|
for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
|
|
706
734
|
]
|
|
707
735
|
|
|
736
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
737
|
+
def download_dataset_bag(
|
|
738
|
+
self,
|
|
739
|
+
dataset: DatasetSpec,
|
|
740
|
+
execution_rid: Optional[RID] = None,
|
|
741
|
+
) -> DatasetBag:
|
|
742
|
+
"""Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
|
|
743
|
+
|
|
744
|
+
Args:
|
|
745
|
+
dataset: Specification of the dataset to be downloaded.
|
|
746
|
+
execution_rid: Execution RID for the dataset.
|
|
747
|
+
|
|
748
|
+
Returns:
|
|
749
|
+
Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
|
|
750
|
+
for the dataset.
|
|
751
|
+
"""
|
|
752
|
+
return self._download_dataset_bag(
|
|
753
|
+
dataset=dataset,
|
|
754
|
+
execution_rid=execution_rid,
|
|
755
|
+
snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
|
|
756
|
+
)
|
|
757
|
+
|
|
708
758
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
709
759
|
def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
|
|
710
760
|
"""Download an asset from a URL and place it in a local directory.
|
|
@@ -808,8 +858,10 @@ class DerivaML(Dataset):
|
|
|
808
858
|
Iterable of the RIDs of the files that were added.
|
|
809
859
|
"""
|
|
810
860
|
defined_types = self.list_vocabulary_terms(MLVocab.file_type)
|
|
811
|
-
if execution_rid and self.resolve_rid(execution_rid).table.name !=
|
|
812
|
-
raise DerivaMLException(
|
|
861
|
+
if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
|
|
862
|
+
raise DerivaMLException(
|
|
863
|
+
f"RID {execution_rid} is not for an execution table."
|
|
864
|
+
)
|
|
813
865
|
|
|
814
866
|
def check_file_type(dtype: str) -> bool:
|
|
815
867
|
for term in defined_types:
|
|
@@ -862,18 +914,11 @@ class DerivaML(Dataset):
|
|
|
862
914
|
self, file_types: Optional[list[str]] = None
|
|
863
915
|
) -> list[dict[str, Any]]:
|
|
864
916
|
"""Return the contents of the file table. Denormalized file types into the file record."""
|
|
865
|
-
atable = next(
|
|
866
|
-
self._model.schemas[self._ml_schema]
|
|
867
|
-
.tables[MLVocab.dataset_type]
|
|
868
|
-
.find_associations()
|
|
869
|
-
).name
|
|
870
917
|
ml_path = self.pathBuilder.schemas[self._ml_schema]
|
|
871
|
-
atable_path = ml_path.tables[atable]
|
|
872
918
|
file_path = ml_path.File
|
|
873
919
|
type_path = ml_path.File_File_Type
|
|
874
920
|
|
|
875
921
|
# Get a list of all the dataset_type values associated with this dataset_table.
|
|
876
|
-
files = []
|
|
877
922
|
path = file_path.link(type_path)
|
|
878
923
|
path = path.attributes(
|
|
879
924
|
path.File.RID,
|
|
@@ -885,10 +930,12 @@ class DerivaML(Dataset):
|
|
|
885
930
|
)
|
|
886
931
|
file_map = {}
|
|
887
932
|
for f in path.fetch():
|
|
888
|
-
file_map.setdefault(f[
|
|
933
|
+
file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
|
|
934
|
+
f["File_Type"]
|
|
935
|
+
)
|
|
889
936
|
|
|
890
937
|
# Now get rid of the File_Type key and return the result
|
|
891
|
-
return [
|
|
938
|
+
return [(f, f.pop("File_Type"))[0] for f in file_map.values()]
|
|
892
939
|
|
|
893
940
|
def list_workflows(self) -> list[Workflow]:
|
|
894
941
|
"""Return a list of all the workflows in the catalog."""
|
|
@@ -901,6 +948,7 @@ class DerivaML(Dataset):
|
|
|
901
948
|
version=w["Version"],
|
|
902
949
|
description=w["Description"],
|
|
903
950
|
rid=w["RID"],
|
|
951
|
+
checksum=w["Checksum"],
|
|
904
952
|
)
|
|
905
953
|
for w in workflow_path.entities().fetch()
|
|
906
954
|
]
|
|
@@ -917,33 +965,18 @@ class DerivaML(Dataset):
|
|
|
917
965
|
"""
|
|
918
966
|
|
|
919
967
|
# Check to make sure that the workflow is not already in the table. If it's not, add it.
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
response = requests.get(url)
|
|
924
|
-
response.raise_for_status()
|
|
925
|
-
except Exception:
|
|
926
|
-
raise DerivaMLException(f"Invalid URL: {url}")
|
|
927
|
-
else:
|
|
928
|
-
sha256_hash = hashlib.sha256()
|
|
929
|
-
sha256_hash.update(response.content)
|
|
930
|
-
checksum = "SHA-256: " + sha256_hash.hexdigest()
|
|
931
|
-
return checksum
|
|
968
|
+
|
|
969
|
+
if workflow_rid := self.lookup_workflow(workflow.url):
|
|
970
|
+
return workflow_rid
|
|
932
971
|
|
|
933
972
|
ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
|
|
934
973
|
try:
|
|
935
|
-
url_column = ml_schema_path.Workflow.URL
|
|
936
|
-
workflow_record = list(
|
|
937
|
-
ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
|
|
938
|
-
)[0]
|
|
939
|
-
workflow_rid = workflow_record["RID"]
|
|
940
|
-
except IndexError:
|
|
941
974
|
# Record doesn't exist already
|
|
942
975
|
workflow_record = {
|
|
943
976
|
"URL": workflow.url,
|
|
944
977
|
"Name": workflow.name,
|
|
945
978
|
"Description": workflow.description,
|
|
946
|
-
"Checksum":
|
|
979
|
+
"Checksum": workflow.checksum,
|
|
947
980
|
"Version": workflow.version,
|
|
948
981
|
MLVocab.workflow_type: self.lookup_term(
|
|
949
982
|
MLVocab.workflow_type, workflow.workflow_type
|
|
@@ -955,6 +988,125 @@ class DerivaML(Dataset):
|
|
|
955
988
|
raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
|
|
956
989
|
return workflow_rid
|
|
957
990
|
|
|
991
|
+
def lookup_workflow(self, url: str) -> Optional[RID]:
    """Find the RID of the workflow registered under a given URL.

    Args:
        url: URL to match against the URL column of the Workflow table.

    Returns:
        The RID of the first matching Workflow row, or None when no row matches.
    """
    workflow_table = self.pathBuilder.schemas[self.ml_schema].Workflow
    # Materialize the result set so that an empty answer can be detected explicitly.
    matches = list(workflow_table.filter(workflow_table.URL == url).entities())
    if not matches:
        return None
    return matches[0]["RID"]
|
|
998
|
+
|
|
999
|
+
def create_workflow(
    self, name: str, workflow_type: str, description: str = "", create: bool = True
) -> Optional[RID]:
    """Describe the currently executing program as a workflow and optionally register it.

    Determine the notebook or script that is currently being executed, assuming that it
    is being run from a cloned GitHub repository. A Workflow is built whose URL points at
    the last committed version of that file and whose checksum is the SHA-256 of the file
    contents (for notebooks, of the contents after cell outputs are stripped by nbstripout).
    A warning is logged if the repository reports uncommitted modifications.

    Args:
        name: The name of the workflow.
        workflow_type: The type of the workflow. Must be a term in the workflow type vocabulary.
        description: The description of the workflow.
        create: If True, pass the workflow to add_workflow, which returns the RID of an existing
            workflow with the same URL or inserts a new one. If False, nothing is added.

    Returns:
        The workflow RID returned by add_workflow when create is True, otherwise None.

    Raises:
        subprocess.CalledProcessError: If nbstripout exits with a non-zero status.
    """
    # Make sure type is correct.
    self.lookup_term(MLVocab.workflow_type, workflow_type)

    # NOTE: _github_url() walks the call stack to find our caller's file, so it must be
    # invoked directly from this method and not from an intermediate helper.
    filename, github_url, is_dirty = self._github_url()

    if is_dirty:
        self._logger.warning(
            f"File {filename} has been modified since last commit. Consider commiting before executing"
        )

    if self._notebook:
        # If you are in a notebook, strip out the outputs before computing the checksum so
        # that re-running cells does not change the workflow identity.
        stripped = subprocess.run(
            ["nbstripout", "-t", filename],
            capture_output=True,
            text=False,
            check=True,
        )
        content = stripped.stdout
    else:
        with open(filename, "rb") as f:
            content = f.read()
    checksum = "SHA-256:" + hashlib.sha256(content).hexdigest()

    workflow = Workflow(
        name=name,
        url=github_url,
        checksum=checksum,
        description=description,
        workflow_type=workflow_type,
    )
    return self.add_workflow(workflow) if create else None
|
|
1047
|
+
|
|
1048
|
+
def _github_url(self) -> tuple[Path, str, bool]:
    """Return a GitHub URL for the latest commit of the script from which this routine is called.

    This routine is meant to be reached from a script or notebook (e.g. python -m file) through
    exactly one intermediate method of this class. It assumes that the file is in a GitHub
    repository and committed, and returns a URL to the last committed version of the file.

    Returns:
        A tuple (filename, github_url, is_dirty): the path of the executing file, the URL of
        its last committed version, and a boolean indicating whether git reports modified
        files in the working tree or index.

    Raises:
        DerivaMLException: If the caller cannot be determined from the stack, if the
            "origin" remote cannot be read, or if no enclosing git repository is found.
    """
    # Get the name of the script that is calling this function.
    if self._notebook:
        filename = Path("").absolute().parent / self._notebook
    else:
        stack = inspect.stack()
        # stack[0] is this method and stack[1] is its caller; the file we want is two up,
        # so at least three frames are required.
        if len(stack) <= 2:
            raise DerivaMLException("Looking for caller failed")  # Stack is too shallow
        filename = Path(stack[2].filename)

    # Get repo URL from local github repo. check=True makes a failing git command raise,
    # so a missing remote is reported instead of silently producing an empty URL.
    try:
        result = subprocess.run(
            ["git", "remote", "get-url", "origin"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise DerivaMLException("No GIT remote found") from e
    github_url = result.stdout.strip().removesuffix(".git")

    # Find the root directory for the repository by walking up from the file. Iterating over
    # the finite list of parents guarantees termination when no .git directory exists.
    for candidate in (filename, *filename.parents):
        if (candidate / ".git").exists():
            repo_root = candidate
            break
    else:
        raise DerivaMLException(f"No GIT repository found for {filename}")

    # Now check to see if anything has been modified since the last commit.
    try:
        result = subprocess.run(
            ["git", "status", "--porcelain"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        is_dirty = False  # If Git command fails, assume no changes
    else:
        # Porcelain lines are "XY path", where X is the index status and Y the work-tree
        # status. The output must not be stripped first: the leading space of " M path" is
        # part of the status code.
        is_dirty = any("M" in line[:2] for line in result.stdout.splitlines())

    sha = get_latest_file_commit(filename)
    url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
    return filename, url, is_dirty
|
|
1109
|
+
|
|
958
1110
|
# @validate_call
|
|
959
1111
|
def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
|
|
960
1112
|
"""Create an execution object
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from sympy import cxxcode
|
|
2
|
+
|
|
3
|
+
from deriva_ml import DerivaML, execution_configuration
|
|
4
|
+
|
|
5
|
+
def execute(host, catalog, script):
|
|
6
|
+
workflow_rid = foobar
|
|
7
|
+
execution_configuration = cxxcode(
|
|
8
|
+
|
|
9
|
+
)
|
|
10
|
+
ml_instance = DerivaML()
|
|
11
|
+
ml_instance.create_execution(configuration)
|
|
12
|
+
script
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
from deriva_ml import DerivaML, ExecutionConfiguration, DatasetSpec, RID, DerivaMLException
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
import json
|
|
19
|
+
import traceback
|
|
20
|
+
import argparse
|
|
21
|
+
import requests
|
|
22
|
+
from requests.exceptions import HTTPError, ConnectionError
|
|
23
|
+
from deriva.transfer import GenericDownloader
|
|
24
|
+
from deriva.transfer.download import DerivaDownloadError, DerivaDownloadConfigurationError, \
|
|
25
|
+
DerivaDownloadAuthenticationError, DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, \
|
|
26
|
+
DerivaDownloadBaggingError
|
|
27
|
+
from deriva.core import BaseCLI, KeyValuePairArgs, format_credential, format_exception, urlparse
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DerivaMLExecCLI(BaseCLI):
|
|
31
|
+
def __init__(self, description, epilog, **kwargs):
|
|
32
|
+
|
|
33
|
+
BaseCLI.__init__(self, description, epilog, **kwargs)
|
|
34
|
+
self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
|
|
35
|
+
self.parser.add_argument("--timeout", metavar="<seconds>",
|
|
36
|
+
help="Total number of seconds elapsed before the download is aborted.")
|
|
37
|
+
self.parser.add_argument("output_dir", metavar="<output dir>", help="Path to an output directory.")
|
|
38
|
+
self.parser.add_argument("envars", metavar="[key=value key=value ...]",
|
|
39
|
+
nargs=argparse.REMAINDER, action=KeyValuePairArgs, default={},
|
|
40
|
+
help="Variable length of whitespace-delimited key=value pair arguments used for "
|
|
41
|
+
"string interpolation in specific parts of the configuration file. "
|
|
42
|
+
"For example: key1=value1 key2=value2")
|
|
43
|
+
|
|
44
|
+
def main(self):
|
|
45
|
+
try:
|
|
46
|
+
args = self.parse_cli()
|
|
47
|
+
except ValueError as e:
|
|
48
|
+
sys.stderr.write(str(e))
|
|
49
|
+
return 2
|
|
50
|
+
if not args.quiet:
|
|
51
|
+
sys.stderr.write("\n")
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
try:
|
|
55
|
+
ml_instance = DerivaML(args.hostname, args.catalog)
|
|
56
|
+
downloaded = self.execute()
|
|
57
|
+
sys.stdout.write("\n%s\n" % (json.dumps(downloaded)))
|
|
58
|
+
except ConnectionError as e:
|
|
59
|
+
raise DerivaDownloadError("Connection error occurred. %s" % format_exception(e))
|
|
60
|
+
except HTTPError as e:
|
|
61
|
+
if e.response.status_code == requests.codes.unauthorized:
|
|
62
|
+
raise DerivaDownloadAuthenticationError(
|
|
63
|
+
"The requested service requires authentication and a valid login session could "
|
|
64
|
+
"not be found for the specified host. Server responded: %s" % e)
|
|
65
|
+
elif e.response.status_code == requests.codes.forbidden:
|
|
66
|
+
raise DerivaDownloadAuthorizationError(
|
|
67
|
+
"A requested operation was forbidden. Server responded: %s" % e)
|
|
68
|
+
except (DerivaDownloadError, DerivaDownloadConfigurationError, DerivaDownloadAuthenticationError,
|
|
69
|
+
DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, DerivaDownloadBaggingError) as e:
|
|
70
|
+
sys.stderr.write(("\n" if not args.quiet else "") + format_exception(e))
|
|
71
|
+
if args.debug:
|
|
72
|
+
traceback.print_exc()
|
|
73
|
+
return 1
|
|
74
|
+
except:
|
|
75
|
+
sys.stderr.write("An unexpected error occurred.")
|
|
76
|
+
traceback.print_exc()
|
|
77
|
+
return 1
|
|
78
|
+
finally:
|
|
79
|
+
if not args.quiet:
|
|
80
|
+
sys.stderr.write("\n\n")
|
|
81
|
+
return 0
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def do_stuff():
|
|
85
|
+
pass
|
|
86
|
+
|
|
87
|
+
def main(datasets: list[RID], model: list[RID], hostname: str, catalog_id: str):
|
|
88
|
+
my_url = DerivaML.github_url()
|
|
89
|
+
ml_instance = DerivaML(hostname, catalog_id)
|
|
90
|
+
ml_instance.lookup_workflow(my_url)
|
|
91
|
+
config = ExecutionConfiguration(
|
|
92
|
+
datasets=[DatasetSpec(rid=dataset,
|
|
93
|
+
version=ml_instance.dataset_version(dataset)) for dataset in datasets],
|
|
94
|
+
assets=model,
|
|
95
|
+
workflow= ml_instance.lookup_workflow(my_url)
|
|
96
|
+
)
|
|
97
|
+
execution = ml_instance.create_execution(config)
|
|
98
|
+
with execution as e:
|
|
99
|
+
do_stuff()
|
|
100
|
+
execution.upload_execution_outputs()
|
|
101
|
+
|
|
102
|
+
if __name__ == "__main__":
|
|
103
|
+
main(datasets, model, hostname, catalog_id)
|
|
104
|
+
if __file__ == matplotlib_inline
|
|
@@ -12,6 +12,7 @@ import os
|
|
|
12
12
|
import shutil
|
|
13
13
|
from datetime import datetime
|
|
14
14
|
from pathlib import Path
|
|
15
|
+
import requests
|
|
15
16
|
from tempfile import NamedTemporaryFile
|
|
16
17
|
from typing import Iterable, Any, Optional
|
|
17
18
|
from deriva.core import format_exception
|
|
@@ -28,7 +29,6 @@ from .deriva_definitions import (
|
|
|
28
29
|
)
|
|
29
30
|
from .deriva_ml_base import DerivaML, FeatureRecord
|
|
30
31
|
from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
|
|
31
|
-
from .dataset import Dataset
|
|
32
32
|
from .dataset_bag import DatasetBag
|
|
33
33
|
from .execution_configuration import ExecutionConfiguration
|
|
34
34
|
from .execution_environment import get_execution_environment
|
|
@@ -51,6 +51,12 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
|
|
|
51
51
|
ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
|
|
52
52
|
|
|
53
53
|
|
|
54
|
+
try:
|
|
55
|
+
from jupyter_server.serverapp import list_running_servers
|
|
56
|
+
except ImportError:
|
|
57
|
+
list_running_servers = lambda: []
|
|
58
|
+
|
|
59
|
+
|
|
54
60
|
class Execution:
|
|
55
61
|
"""The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
|
|
56
62
|
computational, manual processes can be represented by an execution as well.
|
|
@@ -100,6 +106,7 @@ class Execution:
|
|
|
100
106
|
self.configuration = configuration
|
|
101
107
|
self._ml_object = ml_object
|
|
102
108
|
self.start_time = None
|
|
109
|
+
self.stop_time = None
|
|
103
110
|
self.status = Status.created
|
|
104
111
|
self.uploaded_assets: list[Path] = []
|
|
105
112
|
|
|
@@ -221,8 +228,9 @@ class Execution:
|
|
|
221
228
|
Returns:
|
|
222
229
|
the location of the unpacked and validated dataset_table bag and the RID of the bag
|
|
223
230
|
"""
|
|
224
|
-
|
|
225
|
-
|
|
231
|
+
return self._ml_object.download_dataset_bag(
|
|
232
|
+
dataset, execution_rid=self.execution_rid
|
|
233
|
+
)
|
|
226
234
|
|
|
227
235
|
@validate_call
|
|
228
236
|
def update_status(self, status: Status, msg: str) -> None:
|
|
@@ -243,6 +251,35 @@ class Execution:
|
|
|
243
251
|
]
|
|
244
252
|
)
|
|
245
253
|
|
|
254
|
+
def _create_notebook_checkpoint(self) -> None:
    """Save the current content of the running notebook alongside the execution metadata.

    The notebook content is fetched from a running Jupyter server through its contents API
    and written as JSON into the runtime-environment execution metadata directory. This is
    best effort: if no matching server is found, or the server does not answer with
    status 200, no checkpoint file is written.
    """
    import logging

    notebook_name = self._ml_object._notebook

    # Look for the server running this notebook.
    servers = list(list_running_servers())
    if len(servers) != 1:
        # Jupyterhub seems to handle root_dir differently than the plain server case, so the
        # root_dir comparison is only applied when there is more than one candidate.
        root = Path("").absolute().parent.as_posix()
        servers = [s for s in servers if s["root_dir"] == root]
    if not servers:
        logging.getLogger(__name__).warning(
            "No running Jupyter server found for notebook %s; skipping checkpoint.",
            notebook_name,
        )
        return
    server = servers[0]
    notebook_url = f"{server['url']}api/contents/{notebook_name}"

    # Get notebook content
    response = requests.get(
        notebook_url, headers={"Authorization": f"Token {server['token']}"}
    )
    if response.status_code != 200:
        return

    notebook_content = response.json()["content"]
    # Execution metadata cannot be in a directory, so map path into filename.
    checkpoint_path = (
        self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
        / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
    )
    with open(checkpoint_path, "w", encoding="utf-8") as f:
        json.dump(notebook_content, f)
|
|
282
|
+
|
|
246
283
|
def execution_start(self) -> None:
|
|
247
284
|
"""Start an execution, uploading status to catalog"""
|
|
248
285
|
|
|
@@ -252,11 +289,15 @@ class Execution:
|
|
|
252
289
|
|
|
253
290
|
def execution_stop(self) -> None:
|
|
254
291
|
"""Finish the execution and update the duration and status of execution."""
|
|
255
|
-
|
|
292
|
+
self.stop_time = datetime.now()
|
|
293
|
+
duration = self.stop_time - self.start_time
|
|
256
294
|
hours, remainder = divmod(duration.total_seconds(), 3600)
|
|
257
295
|
minutes, seconds = divmod(remainder, 60)
|
|
258
296
|
duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
|
|
259
297
|
|
|
298
|
+
if self._ml_object._notebook:
|
|
299
|
+
self._create_notebook_checkpoint()
|
|
300
|
+
|
|
260
301
|
self.update_status(Status.completed, "Algorithm execution ended.")
|
|
261
302
|
self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
|
|
262
303
|
[{"RID": self.execution_rid, "Duration": duration}]
|
|
@@ -33,18 +33,18 @@ class Workflow(BaseModel):
|
|
|
33
33
|
version: Optional[str] = None
|
|
34
34
|
description: Optional[str] = ""
|
|
35
35
|
rid: Optional[RID] = None
|
|
36
|
+
checksum: Optional[str]
|
|
37
|
+
|
|
36
38
|
|
|
37
39
|
|
|
38
40
|
class ExecutionConfiguration(BaseModel):
|
|
39
41
|
"""Define the parameters that are used to configure a specific execution.
|
|
40
42
|
|
|
41
43
|
Attributes:
|
|
42
|
-
datasets: List of
|
|
43
|
-
|
|
44
|
-
needed, a dictionary that defines the rid and the materialization parameter for the
|
|
45
|
-
download_dataset_bag method can be specified, e.g. datasets=[{'rid': RID, 'materialize': True}].
|
|
44
|
+
datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
|
|
45
|
+
should be materialized.
|
|
46
46
|
assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
|
|
47
|
-
workflow: A workflow instance. Must have a name, URI to the workflow instance, and a type.
|
|
47
|
+
workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
|
|
48
48
|
description: A description of the execution. Can use Markdown format.
|
|
49
49
|
"""
|
|
50
50
|
|
|
@@ -70,8 +70,11 @@ exec_asset_regex = (
|
|
|
70
70
|
exec_metadata_dir_regex = (
|
|
71
71
|
exec_dir_regex + r"/execution-metadata/(?P<execution_metadata_type>[-\w]+)"
|
|
72
72
|
)
|
|
73
|
+
|
|
74
|
+
# May have more than one suffix
|
|
73
75
|
exec_metadata_regex = (
|
|
74
|
-
exec_metadata_dir_regex
|
|
76
|
+
exec_metadata_dir_regex
|
|
77
|
+
+ r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
|
|
75
78
|
)
|
|
76
79
|
feature_dir_regex = exec_dir_regex + r"/feature"
|
|
77
80
|
feature_table_dir_regex = (
|
|
@@ -1,16 +1,18 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: deriva-ml
|
|
3
|
-
Version: 1.8.
|
|
3
|
+
Version: 1.8.4
|
|
4
4
|
Summary: Utilities to simplify use of Deriva and Pandas to create reproducible ML pipelines
|
|
5
5
|
Author-email: ISRD <isrd-dev@isi.edu>
|
|
6
6
|
Requires-Python: >=3.10
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
License-File: LICENSE
|
|
9
|
-
Requires-Dist: deriva~=1.7.
|
|
9
|
+
Requires-Dist: deriva~=1.7.7
|
|
10
10
|
Requires-Dist: pandas
|
|
11
11
|
Requires-Dist: regex~=2024.7.24
|
|
12
12
|
Requires-Dist: pydantic>=2.10.6
|
|
13
13
|
Requires-Dist: semver>3.0.0
|
|
14
|
+
Requires-Dist: setuptools-git-versioning<3,>=2.0
|
|
15
|
+
Requires-Dist: nbstripout
|
|
14
16
|
|
|
15
17
|
Deriva-ML is a Python library to simplify the process of creating and executing reproducible machine learning workflows
|
|
16
18
|
using a deriva catalog.
|
|
@@ -10,6 +10,7 @@ src/deriva_ml/dataset_bag.py
|
|
|
10
10
|
src/deriva_ml/demo_catalog.py
|
|
11
11
|
src/deriva_ml/deriva_definitions.py
|
|
12
12
|
src/deriva_ml/deriva_ml_base.py
|
|
13
|
+
src/deriva_ml/deriva_ml_execute.py
|
|
13
14
|
src/deriva_ml/deriva_model.py
|
|
14
15
|
src/deriva_ml/execution.py
|
|
15
16
|
src/deriva_ml/execution_configuration.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.8.2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{deriva_ml-1.8.2 → deriva_ml-1.8.4}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|