deriva-ml 1.13.1__py3-none-any.whl → 1.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/database_model.py +5 -11
- deriva_ml/dataset.py +293 -307
- deriva_ml/dataset_aux_classes.py +10 -10
- deriva_ml/demo_catalog.py +90 -67
- deriva_ml/deriva_definitions.py +43 -4
- deriva_ml/deriva_ml_base.py +31 -30
- deriva_ml/deriva_model.py +17 -5
- deriva_ml/execution.py +102 -89
- deriva_ml/execution_configuration.py +2 -1
- deriva_ml/history.py +2 -0
- deriva_ml/schema_setup/annotations.py +341 -126
- deriva_ml/schema_setup/create_schema.py +33 -65
- deriva_ml/schema_setup/policy.json +7 -3
- deriva_ml/upload.py +3 -3
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/METADATA +2 -2
- deriva_ml-1.13.3.dist-info/RECORD +31 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/WHEEL +1 -1
- deriva_ml-1.13.1.dist-info/RECORD +0 -31
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.13.1.dist-info → deriva_ml-1.13.3.dist-info}/top_level.txt +0 -0
deriva_ml/execution.py
CHANGED
|
@@ -12,13 +12,11 @@ import os
|
|
|
12
12
|
from pathlib import Path
|
|
13
13
|
|
|
14
14
|
from pydantic import validate_call, ConfigDict
|
|
15
|
-
import regex as re
|
|
16
15
|
import sys
|
|
17
16
|
import shutil
|
|
18
17
|
from typing import Iterable, Any, Optional
|
|
19
18
|
|
|
20
19
|
from deriva.core import format_exception
|
|
21
|
-
from deriva.core.datapath import DataPathException
|
|
22
20
|
from deriva.core.hatrac_store import HatracStore
|
|
23
21
|
from .deriva_definitions import (
|
|
24
22
|
RID,
|
|
@@ -29,6 +27,7 @@ from .deriva_definitions import (
|
|
|
29
27
|
MLAsset,
|
|
30
28
|
ExecMetadataType,
|
|
31
29
|
ExecAssetType,
|
|
30
|
+
FileSpec,
|
|
32
31
|
DRY_RUN_RID,
|
|
33
32
|
)
|
|
34
33
|
from .deriva_ml_base import DerivaML, FeatureRecord
|
|
@@ -66,29 +65,43 @@ except ImportError:
|
|
|
66
65
|
return s
|
|
67
66
|
|
|
68
67
|
|
|
69
|
-
class
|
|
70
|
-
|
|
68
|
+
# Platform-specific base class
|
|
69
|
+
if sys.version_info >= (3, 12):
|
|
71
70
|
|
|
72
|
-
|
|
73
|
-
|
|
71
|
+
class AssetFilePath(Path):
|
|
72
|
+
"""
|
|
73
|
+
Create a new Path object that has additional information related to the use of this path as an asset.
|
|
74
74
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
75
|
+
Args:
|
|
76
|
+
asset_path: Local path to the location of the asset.
|
|
77
|
+
asset_name: The name of the asset in the catalog (e.g. the asset table name).
|
|
78
|
+
file_name: Name of the local file that contains the contents of the asset.
|
|
79
|
+
asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
|
|
80
|
+
asset_types: A list of terms from the Asset_Type controlled vocabulary.
|
|
81
|
+
asset_rid: The RID of the asset if it has been uploaded into an asset table
|
|
82
|
+
"""
|
|
82
83
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
84
|
+
def __init__(
|
|
85
|
+
self,
|
|
86
|
+
asset_path: str | Path,
|
|
87
|
+
asset_name: str,
|
|
88
|
+
file_name: str,
|
|
89
|
+
asset_metadata: dict[str, Any],
|
|
90
|
+
asset_types: list[str] | str,
|
|
91
|
+
asset_rid: Optional["RID"] = None,
|
|
92
|
+
):
|
|
93
|
+
super().__init__(asset_path)
|
|
94
|
+
# These assignments happen after __new__ returns the instance
|
|
95
|
+
self.asset_name = asset_name
|
|
96
|
+
self.file_name = file_name
|
|
97
|
+
self.asset_metadata = asset_metadata
|
|
98
|
+
self.asset_types = (
|
|
99
|
+
asset_types if isinstance(asset_types, list) else [asset_types]
|
|
100
|
+
)
|
|
101
|
+
self.asset_rid = asset_rid
|
|
102
|
+
else:
|
|
103
|
+
|
|
104
|
+
class AssetFilePath(type(Path())):
|
|
92
105
|
"""
|
|
93
106
|
Create a new Path object that has additional information related to the use of this path as an asset.
|
|
94
107
|
|
|
@@ -100,15 +113,26 @@ class AssetFilePath(type(Path())):
|
|
|
100
113
|
asset_types: A list of terms from the Asset_Type controlled vocabulary.
|
|
101
114
|
asset_rid: The RID of the asset if it has been uploaded into an asset table
|
|
102
115
|
"""
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
116
|
+
|
|
117
|
+
def __new__(
|
|
118
|
+
cls,
|
|
119
|
+
asset_path: str | Path,
|
|
120
|
+
asset_name: str,
|
|
121
|
+
file_name: str,
|
|
122
|
+
asset_metadata: dict[str, Any],
|
|
123
|
+
asset_types: list[str] | str,
|
|
124
|
+
asset_rid: Optional["RID"] = None,
|
|
125
|
+
):
|
|
126
|
+
# Only pass the path to the base Path class
|
|
127
|
+
obj = super().__new__(cls, asset_path)
|
|
128
|
+
obj.asset_name = asset_name
|
|
129
|
+
obj.file_name = file_name
|
|
130
|
+
obj.asset_metadata = asset_metadata
|
|
131
|
+
obj.asset_types = (
|
|
132
|
+
asset_types if isinstance(asset_types, list) else [asset_types]
|
|
133
|
+
)
|
|
134
|
+
obj.asset_rid = asset_rid
|
|
135
|
+
return obj
|
|
112
136
|
|
|
113
137
|
|
|
114
138
|
class Execution:
|
|
@@ -155,7 +179,7 @@ class Execution:
|
|
|
155
179
|
Args:
|
|
156
180
|
configuration: Execution configuration object that describes the execution.
|
|
157
181
|
ml_object: The DerivaML instance that created the execution.
|
|
158
|
-
reload: RID of previously initialized execution object.
|
|
182
|
+
reload: RID of a previously initialized execution object.
|
|
159
183
|
"""
|
|
160
184
|
self.asset_paths: list[AssetFilePath] = []
|
|
161
185
|
self.configuration = configuration
|
|
@@ -476,7 +500,7 @@ class Execution:
|
|
|
476
500
|
"""Download an asset from a URL and place it in a local directory.
|
|
477
501
|
|
|
478
502
|
Args:
|
|
479
|
-
asset_rid:
|
|
503
|
+
asset_rid: RID of the asset.
|
|
480
504
|
dest_dir: Destination directory for the asset.
|
|
481
505
|
update_catalog: Whether to update the catalog execution information after downloading.
|
|
482
506
|
|
|
@@ -656,20 +680,9 @@ class Execution:
|
|
|
656
680
|
with open(feature_file, "r") as feature_values:
|
|
657
681
|
entities = [json.loads(line.strip()) for line in feature_values]
|
|
658
682
|
# Update the asset columns in the feature and add to the catalog.
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
)
|
|
663
|
-
except DataPathException as e:
|
|
664
|
-
if re.match(
|
|
665
|
-
rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
|
|
666
|
-
e.message,
|
|
667
|
-
):
|
|
668
|
-
self._logger.info(
|
|
669
|
-
f"Skipping reload of feature values for {feature_table}"
|
|
670
|
-
)
|
|
671
|
-
else:
|
|
672
|
-
raise e
|
|
683
|
+
self._ml_object.domain_path.tables[feature_table].insert(
|
|
684
|
+
[map_path(e) for e in entities], on_conflict_skip=True
|
|
685
|
+
)
|
|
673
686
|
|
|
674
687
|
def _update_asset_execution_table(
|
|
675
688
|
self,
|
|
@@ -694,27 +707,17 @@ class Execution:
|
|
|
694
707
|
asset_exe = self._model.find_association(asset_table_name, "Execution")
|
|
695
708
|
asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
|
|
696
709
|
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
except DataPathException as e:
|
|
709
|
-
if re.match(
|
|
710
|
-
rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
|
|
711
|
-
e.message,
|
|
712
|
-
):
|
|
713
|
-
self._logger.info(
|
|
714
|
-
f"Skipping reload of execution assocations for {asset_table_name}"
|
|
715
|
-
)
|
|
716
|
-
else:
|
|
717
|
-
raise e
|
|
710
|
+
asset_exe_path.insert(
|
|
711
|
+
[
|
|
712
|
+
{
|
|
713
|
+
asset_table_name: asset_path.asset_rid,
|
|
714
|
+
"Execution": self.execution_rid,
|
|
715
|
+
"Asset_Role": asset_role,
|
|
716
|
+
}
|
|
717
|
+
for asset_path in asset_list
|
|
718
|
+
],
|
|
719
|
+
on_conflict_skip=True,
|
|
720
|
+
)
|
|
718
721
|
|
|
719
722
|
# Now add in the type names via the asset_asset_type association table.
|
|
720
723
|
# Get the list of types for each file in the asset.
|
|
@@ -740,24 +743,15 @@ class Execution:
|
|
|
740
743
|
type_path = pb.schemas[asset_asset_type.schema.name].tables[
|
|
741
744
|
asset_asset_type.name
|
|
742
745
|
]
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
if re.match(
|
|
753
|
-
rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
|
|
754
|
-
e.message,
|
|
755
|
-
):
|
|
756
|
-
self._logger.info(
|
|
757
|
-
f"Skipping reload of execution asset types for {asset_table_name}"
|
|
758
|
-
)
|
|
759
|
-
else:
|
|
760
|
-
raise e
|
|
746
|
+
|
|
747
|
+
type_path.insert(
|
|
748
|
+
[
|
|
749
|
+
{asset_table_name: asset.asset_rid, "Asset_Type": t}
|
|
750
|
+
for asset in asset_list
|
|
751
|
+
for t in asset_type_map[asset.file_name]
|
|
752
|
+
],
|
|
753
|
+
on_conflict_skip=True,
|
|
754
|
+
)
|
|
761
755
|
|
|
762
756
|
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
763
757
|
def asset_file_path(
|
|
@@ -889,19 +883,25 @@ class Execution:
|
|
|
889
883
|
feature.Execution = self.execution_rid
|
|
890
884
|
file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
|
|
891
885
|
|
|
892
|
-
@validate_call
|
|
893
|
-
def create_dataset(
|
|
886
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
887
|
+
def create_dataset(
|
|
888
|
+
self,
|
|
889
|
+
dataset_types: str | list[str],
|
|
890
|
+
description: str,
|
|
891
|
+
version: Optional[DatasetVersion] = None,
|
|
892
|
+
) -> RID:
|
|
894
893
|
"""Create a new dataset with specified types.
|
|
895
894
|
|
|
896
895
|
Args:
|
|
897
896
|
dataset_types: param description:
|
|
898
897
|
description: Markdown description of the dataset being created.
|
|
898
|
+
version: Version to assign to the dataset. Defaults to 0.1.0
|
|
899
899
|
|
|
900
900
|
Returns:
|
|
901
901
|
RID of the newly created dataset.
|
|
902
902
|
"""
|
|
903
903
|
return self._ml_object.create_dataset(
|
|
904
|
-
dataset_types, description, self.execution_rid
|
|
904
|
+
dataset_types, description, self.execution_rid, version=version
|
|
905
905
|
)
|
|
906
906
|
|
|
907
907
|
def add_dataset_members(
|
|
@@ -959,6 +959,19 @@ class Execution:
|
|
|
959
959
|
execution_rid=self.execution_rid,
|
|
960
960
|
)
|
|
961
961
|
|
|
962
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
|
|
963
|
+
def add_files(
|
|
964
|
+
self,
|
|
965
|
+
files: Iterable[FileSpec],
|
|
966
|
+
file_types: str | list[str],
|
|
967
|
+
) -> Iterable[RID]:
|
|
968
|
+
"""Add files to the file table"""
|
|
969
|
+
return self._ml_object.add_files(
|
|
970
|
+
files=files,
|
|
971
|
+
file_types=file_types,
|
|
972
|
+
execution_rid=self.execution_rid,
|
|
973
|
+
)
|
|
974
|
+
|
|
962
975
|
def __str__(self):
|
|
963
976
|
items = [
|
|
964
977
|
f"caching_dir: {self._cache_dir}",
|
|
@@ -325,7 +325,8 @@ class ExecutionConfiguration(BaseModel):
|
|
|
325
325
|
should be materialized.
|
|
326
326
|
assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
|
|
327
327
|
parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
|
|
328
|
-
workflow:
|
|
328
|
+
workflow: Either a Workflow object, or a RID for a workflow instance.
|
|
329
|
+
parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
|
|
329
330
|
description: A description of the execution. Can use Markdown format.
|
|
330
331
|
"""
|
|
331
332
|
|
deriva_ml/history.py
CHANGED
|
@@ -54,6 +54,8 @@ def datetime_epoch_us(dt):
|
|
|
54
54
|
# -- --------------------------------------------------------------------------------------
|
|
55
55
|
# Take the iso format string (same as RMT) and return the version number
|
|
56
56
|
#
|
|
57
|
+
|
|
58
|
+
|
|
57
59
|
def iso_to_snap(iso_datetime):
|
|
58
60
|
rmt = datetime.fromisoformat(iso_datetime)
|
|
59
61
|
return urlb32_encode(datetime_epoch_us(rmt))
|