deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +25 -30
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1489 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +4 -0
- deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
- deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
- deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
- deriva_ml/{history.py → dataset/history.py} +51 -33
- deriva_ml/{upload.py → dataset/upload.py} +48 -70
- deriva_ml/demo_catalog.py +233 -183
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/{execution.py → execution/execution.py} +365 -252
- deriva_ml/execution/execution_configuration.py +163 -0
- deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
- deriva_ml/feature.py +83 -46
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
- deriva_ml/{database_model.py → model/database.py} +52 -74
- deriva_ml/model/sql_mapper.py +44 -0
- deriva_ml/run_notebook.py +19 -11
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/{schema_setup → schema}/annotations.py +31 -22
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
- deriva_ml-1.14.26.dist-info/RECORD +40 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
- deriva_ml/deriva_definitions.py +0 -391
- deriva_ml/deriva_ml_base.py +0 -1046
- deriva_ml/execution_environment.py +0 -139
- deriva_ml/schema_setup/table_comments_utils.py +0 -56
- deriva_ml/test-files/execution-parameters.json +0 -1
- deriva_ml/test-files/notebook-parameters.json +0 -5
- deriva_ml/test_functions.py +0 -141
- deriva_ml/test_notebook.ipynb +0 -197
- deriva_ml-1.14.0.dist-info/RECORD +0 -31
- /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
- /deriva_ml/{schema_setup → schema}/policy.json +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
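
The headline change in this release is a package restructuring: the flat `deriva_ml` modules are split into `core`, `dataset`, `execution`, `model`, and `schema` subpackages (with `schema_setup` renamed to `schema`, and the old `deriva_ml_base.py`, `deriva_definitions.py`, and `execution_environment.py` modules removed). Downstream code that imported the removed flat modules directly will need new import paths. A minimal sketch of the migration — the new paths are taken from the diff below, while the old-side module contents are an assumption based on the removed files listed above (the package `__init__.py` also changed, so top-level re-exports may shield many users):

    # Old flat layout (1.14.0) -- module names from the removed files above;
    # the exact symbols they exported are an assumption:
    # from deriva_ml.deriva_ml_base import DerivaML
    # from deriva_ml.deriva_definitions import RID, Status
    # from deriva_ml.dataset_bag import DatasetBag

    # New subpackage layout (1.14.26) -- import paths as used in the diff below:
    from deriva_ml.core.base import DerivaML
    from deriva_ml.core.definitions import RID, Status
    from deriva_ml.dataset.dataset_bag import DatasetBag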
deriva_ml/{execution.py → execution/execution.py}

@@ -1,52 +1,78 @@
-"""
-
+"""Execution management for DerivaML.
+
+This module provides functionality for managing and tracking executions in DerivaML. An execution
+represents a computational or manual process that operates on datasets and produces outputs.
+The module includes:
+
+- Execution class: Core class for managing execution state and context
+- Asset management: Track input and output files
+- Status tracking: Monitor and update execution progress
+- Dataset handling: Download and materialize required datasets
+- Provenance tracking: Record relationships between inputs, processes, and outputs
+
+The Execution class serves as the primary interface for managing the lifecycle of a computational
+or manual process within DerivaML.
+
+Typical usage example:
+    >>> config = ExecutionConfiguration(workflow="analysis_workflow", description="Data analysis")
+    >>> with ml.create_execution(config) as execution:
+    ...     execution.download_dataset_bag(dataset_spec)
+    ...     # Run analysis
+    ...     execution.upload_execution_outputs()
 """

 from __future__ import annotations

-from collections import defaultdict
-from datetime import datetime
 import json
 import logging
 import os
-from pathlib import Path
-
-from pydantic import validate_call, ConfigDict
-import sys
 import shutil
-
+import sys
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterable, List

 from deriva.core import format_exception
 from deriva.core.hatrac_store import HatracStore
-from …
+from pydantic import ConfigDict, validate_call
+
+from deriva_ml.core.base import DerivaML
+from deriva_ml.core.definitions import (
+    DRY_RUN_RID,
     RID,
-    Status,
-    FileUploadState,
-    DerivaMLException,
-    MLVocab,
-    MLAsset,
-    ExecMetadataType,
     ExecAssetType,
+    ExecMetadataType,
     FileSpec,
-
+    FileUploadState,
+    MLAsset,
+    MLVocab,
+    Status,
 )
-from .…
-from .…
-from .dataset_bag import DatasetBag
-from .…
-
-
+from deriva_ml.core.exceptions import DerivaMLException
+from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion, VersionPart
+from deriva_ml.dataset.dataset_bag import DatasetBag
+from deriva_ml.dataset.upload import (
+    asset_file_path,
+    asset_root,
+    asset_type_path,
     execution_root,
     feature_root,
-    asset_root,
     feature_value_path,
     is_feature_dir,
+    normalize_asset_dir,
     table_path,
     upload_directory,
-    normalize_asset_dir,
-    asset_file_path,
-    asset_type_path,
 )
+from deriva_ml.execution.environment import get_execution_environment
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.workflow import Workflow
+from deriva_ml.feature import FeatureRecord
+
+# Keep pycharm from complaining about undefined references in docstrings.
+execution: Execution
+ml: DerivaML
+dataset_spec: DatasetSpec

 try:
     from icecream import ic
@@ -55,7 +81,7 @@ except ImportError: # Graceful fallback if IceCream isn't installed.


 try:
-    from IPython.display import …
+    from IPython.display import Markdown, display
 except ImportError:

     def display(s):
@@ -69,16 +95,27 @@ except ImportError:
 if sys.version_info >= (3, 12):

     class AssetFilePath(Path):
-        """…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        """Extended Path class for managing asset files.
+
+        Represents a file path with additional metadata about its role as an asset in the catalog.
+        This class extends the standard Path class to include information about the asset's
+        catalog representation and type.
+
+        Attributes:
+            asset_name (str): Name of the asset in the catalog (e.g., asset table name).
+            file_name (str): Name of the local file containing the asset.
+            asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
+            asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
+            asset_rid (RID | None): Resource Identifier if uploaded to an asset table.
+
+        Example:
+            >>> path = AssetFilePath(
+            ...     "/path/to/file.txt",
+            ...     asset_name="analysis_output",
+            ...     file_name="results.txt",
+            ...     asset_metadata={"version": "1.0"},
+            ...     asset_types=["text", "results"]
+            ... )
         """

         def __init__(
@@ -88,16 +125,23 @@ if sys.version_info >= (3, 12):
             file_name: str,
             asset_metadata: dict[str, Any],
             asset_types: list[str] | str,
-            asset_rid: …
+            asset_rid: RID | None = None,
         ):
+            """Initializes an AssetFilePath instance.
+
+            Args:
+                asset_path: Local path to the asset file.
+                asset_name: Name of the asset in the catalog.
+                file_name: Name of the local file.
+                asset_metadata: Additional metadata columns.
+                asset_types: One or more asset type terms.
+                asset_rid: Optional Resource Identifier if already in catalog.
+            """
             super().__init__(asset_path)
-            # These assignments happen after __new__ returns the instance
             self.asset_name = asset_name
             self.file_name = file_name
             self.asset_metadata = asset_metadata
-            self.asset_types = (
-                asset_types if isinstance(asset_types, list) else [asset_types]
-            )
+            self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
             self.asset_rid = asset_rid
 else:

@@ -105,9 +149,9 @@ else:
         """
         Create a new Path object that has additional information related to the use of this path as an asset.

-
+        Attrubytes:
             asset_path: Local path to the location of the asset.
-            asset_name: The name of the asset in the catalog (e.g…
+            asset_name: The name of the asset in the catalog (e.g., the asset table name).
             file_name: Name of the local file that contains the contents of the asset.
             asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
             asset_types: A list of terms from the Asset_Type controlled vocabulary.
@@ -121,65 +165,76 @@ else:
             file_name: str,
             asset_metadata: dict[str, Any],
             asset_types: list[str] | str,
-            asset_rid: …
+            asset_rid: RID | None = None,
         ):
             # Only pass the path to the base Path class
             obj = super().__new__(cls, asset_path)
             obj.asset_name = asset_name
             obj.file_name = file_name
             obj.asset_metadata = asset_metadata
-            obj.asset_types = (
-                asset_types if isinstance(asset_types, list) else [asset_types]
-            )
+            obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
             obj.asset_rid = asset_rid
             return obj


 class Execution:
-    """…
-    computational, manual processes can be represented by an execution as well.
-
-    Within DerivaML, Executions are used to provide providence. Every dataset_table and data file that is generated is
-    associated with an execution, which records which program and input parameters were used to generate that data.
-
-    Execution objects are created from an ExecutionConfiguration, which provides information about what DerivaML
-    datasets will be used, what additional files (assets) are required, what code is being run (Workflow) and an
-    optional description of the Execution. Side effects of creating an execution object are:
-
-    1. An execution record is created in the catalog and the RID of that record recorded,
-    2. Any specified datasets are downloaded and materialized
-    3. Any additional required assets are downloaded.
+    """Manages the lifecycle and context of a DerivaML execution.

-
-
-
+    An Execution represents a computational or manual process within DerivaML. It provides:
+    - Dataset materialization and access
+    - Asset management (inputs and outputs)
+    - Status tracking and updates
+    - Provenance recording
+    - Result upload and cataloging

-
-
+    The class handles downloading required datasets and assets, tracking execution state,
+    and managing the upload of results. Every dataset and file generated is associated
+    with an execution record for provenance tracking.

     Attributes:
-        dataset_rids (list[RID]):
-        datasets (list[DatasetBag]):
-
-
-
-
+        dataset_rids (list[RID]): RIDs of datasets used in the execution.
+        datasets (list[DatasetBag]): Materialized dataset objects.
+        configuration (ExecutionConfiguration): Execution settings and parameters.
+        workflow_rid (RID): RID of the associated workflow.
+        status (Status): Current execution status.
+        asset_paths (list[AssetFilePath]): Paths to execution assets.
+        parameters (dict): Execution parameters.
+        start_time (datetime | None): When execution started.
+        stop_time (datetime | None): When execution completed.
+
+    Example:
+        >>> config = ExecutionConfiguration(
+        ...     workflow="analysis",
+        ...     description="Process samples",
+        ...     parameters={"threshold": 0.5}
+        ... )
+        >>> with ml.create_execution(config) as execution:
+        ...     execution.download_dataset_bag(dataset_spec)
+        ...     # Run analysis
+        ...     execution.upload_execution_outputs()
     """

     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def __init__(
         self,
         configuration: ExecutionConfiguration,
-        ml_object: …
-        reload: …
+        ml_object: DerivaML,
+        reload: RID | None = None,
         dry_run: bool = False,
     ):
-        """
+        """Initializes an Execution instance.
+
+        Creates a new execution or reloads an existing one. Initializes the execution
+        environment, downloads required datasets, and sets up asset tracking.

         Args:
-            configuration: …
-            ml_object: …
-            reload: RID of …
+            configuration: Settings and parameters for the execution.
+            ml_object: DerivaML instance managing the execution.
+            reload: Optional RID of existing execution to reload.
+            dry_run: If True, don't create catalog records or upload results.
+
+        Raises:
+            DerivaMLException: If initialization fails or configuration is invalid.
         """
         self.asset_paths: list[AssetFilePath] = []
         self.configuration = configuration
@@ -189,10 +244,10 @@ class Execution:
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
-        self.uploaded_assets: …
+        self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
         self.configuration.argv = sys.argv

-        self.dataset_rids: …
+        self.dataset_rids: List[RID] = []
         self.datasets: list[DatasetBag] = []
         self.parameters = self.configuration.parameters

@@ -203,32 +258,21 @@ class Execution:
         # Make sure we have a good workflow.
         if isinstance(self.configuration.workflow, Workflow):
             self.workflow_rid = (
-                self._ml_object.add_workflow(self.configuration.workflow)
-                if not self._dry_run
-                else DRY_RUN_RID
+                self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
             )
         else:
             self.workflow_rid = self.configuration.workflow
-            if (
-                …
-                != "Workflow"
-            ):
-                raise DerivaMLException(
-                    "Workflow specified in execution configuration is not a Workflow"
-                )
+            if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
+                raise DerivaMLException("Workflow specified in execution configuration is not a Workflow")

         # Validate the datasets and assets to be valid.
         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
-                raise DerivaMLException(
-                    "Dataset specified in execution configuration is not a dataset"
-                )
+                raise DerivaMLException("Dataset specified in execution configuration is not a dataset")

         for a in self.configuration.assets:
             if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
-                raise DerivaMLException(
-                    "Asset specified in execution configuration is not a asset table"
-                )
+                raise DerivaMLException("Asset specified in execution configuration is not a asset table")

         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
@@ -247,16 +291,11 @@ class Execution:
                 ]
             )[0]["RID"]

-        if (
-            …
-            …
-        ):
-            # Put execution_rid into cell output so we can find it later.
-            display(
-                Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}")
-            )
+        if isinstance(self.configuration.workflow, Workflow) and self.configuration.workflow.is_notebook:
+            # Put execution_rid into the cell output so we can find it later.
+            display(Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}"))

-        # Create a directory for execution rid so we can recover state in case of a crash.
+        # Create a directory for execution rid so we can recover the state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
         self._initialize_execution(reload)

@@ -266,12 +305,12 @@ class Execution:
             f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
             ExecMetadataType.runtime_env.value,
         )
-        with open(…
+        with Path(runtime_env_path).open("w") as fp:
             json.dump(get_execution_environment(), fp)

-    def _initialize_execution(self, reload: …
-        """Initialize the execution by a configuration …
-
+    def _initialize_execution(self, reload: RID | None = None) -> None:
+        """Initialize the execution by a configuration in the Execution_Metadata table.
+        Set up a working directory and download all the assets and data.

         :raise DerivaMLException: If there is an issue initializing the execution.

@@ -283,9 +322,7 @@ class Execution:
         """
         # Materialize bdbag
         for dataset in self.configuration.datasets:
-            self.update_status(
-                Status.initializing, f"Materialize bag {dataset.rid}... "
-            )
+            self.update_status(Status.initializing, f"Materialize bag {dataset.rid}... ")
             self.datasets.append(self.download_dataset_bag(dataset))
             self.dataset_rids.append(dataset.rid)

@@ -293,10 +330,7 @@ class Execution:
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
-                [
-                    {"Dataset": d, "Execution": self.execution_rid}
-                    for d in self.dataset_rids
-                ]
+                [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
             )

         # Download assets....
@@ -305,9 +339,7 @@ class Execution:
         for asset_rid in self.configuration.assets:
             asset_table = self._ml_object.resolve_rid(asset_rid).table.name
             dest_dir = (
-                execution_root(self._ml_object.working_dir, self.execution_rid)
-                / "downloaded-assets"
-                / asset_table
+                execution_root(self._ml_object.working_dir, self.execution_rid) / "downloaded-assets" / asset_table
             )
             dest_dir.mkdir(parents=True, exist_ok=True)
             self.asset_paths.setdefault(asset_table, []).append(
@@ -325,7 +357,7 @@ class Execution:
             "configuration.json",
             ExecMetadataType.execution_config.value,
         )
-        with …
+        with Path(cfile).open("w", encoding="utf-8") as config_file:
             json.dump(self.configuration.model_dump(), config_file)

         for parameter_file in self.configuration.parameters:
@@ -355,7 +387,7 @@ class Execution:

     @property
     def _feature_root(self) -> Path:
-        """The root path to all execution …
+        """The root path to all execution-specific files.
         :return:

         Args:
@@ -367,7 +399,7 @@ class Execution:

     @property
     def _asset_root(self) -> Path:
-        """The root path to all execution …
+        """The root path to all execution-specific files.
         :return:

         Args:
@@ -379,26 +411,47 @@ class Execution:

     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
-        """
-
+        """Downloads and materializes a dataset for use in the execution.
+
+        Downloads the specified dataset as a BDBag and materializes it in the execution's
+        working directory. The dataset version is determined by the DatasetSpec.

         Args:
-            dataset: …
+            dataset: Specification of the dataset to download, including version and
+                materialization options.

         Returns:
-
+            DatasetBag: Object containing:
+                - path: Local filesystem path to downloaded dataset
+                - rid: Dataset's Resource Identifier
+                - minid: Dataset's Minimal Viable Identifier
+
+        Raises:
+            DerivaMLException: If download or materialization fails.
+
+        Example:
+            >>> spec = DatasetSpec(rid="1-abc123", version="1.2.0")
+            >>> bag = execution.download_dataset_bag(spec)
+            >>> print(f"Downloaded to {bag.path}")
         """
-        return self._ml_object.download_dataset_bag(
-            dataset, execution_rid=self.execution_rid
-        )
+        return self._ml_object.download_dataset_bag(dataset, execution_rid=self.execution_rid)

     @validate_call
     def update_status(self, status: Status, msg: str) -> None:
-        """
+        """Updates the execution's status in the catalog.
+
+        Records a new status and associated message in the catalog, allowing remote
+        tracking of execution progress.

         Args:
-            status: …
-            msg: …
+            status: New status value (e.g., running, completed, failed).
+            msg: Description of the status change or current state.
+
+        Raises:
+            DerivaMLException: If status update fails.
+
+        Example:
+            >>> execution.update_status(Status.running, "Processing sample 1 of 10")
         """
         self.status = status
         self._logger.info(msg)
@@ -417,14 +470,36 @@ class Execution:
         )

     def execution_start(self) -> None:
-        """
-
+        """Marks the execution as started.
+
+        Records the start time and updates the execution's status to 'running'.
+        This should be called before beginning the main execution work.
+
+        Example:
+            >>> execution.execution_start()
+            >>> try:
+            ...     # Run analysis
+            ...     execution.execution_stop()
+            ... except Exception:
+            ...     execution.update_status(Status.failed, "Analysis error")
+        """
         self.start_time = datetime.now()
         self.uploaded_assets = None
         self.update_status(Status.initializing, "Start execution ...")

     def execution_stop(self) -> None:
-        """
+        """Marks the execution as completed.
+
+        Records the stop time and updates the execution's status to 'completed'.
+        This should be called after all execution work is finished.
+
+        Example:
+            >>> try:
+            ...     # Run analysis
+            ...     execution.execution_stop()
+            ... except Exception:
+            ...     execution.update_status(Status.failed, "Analysis error")
+        """
         self.stop_time = datetime.now()
         duration = self.stop_time - self.start_time
         hours, remainder = divmod(duration.total_seconds(), 3600)
@@ -433,22 +508,22 @@ class Execution:

         self.update_status(Status.completed, "Algorithm execution ended.")
         if not self._dry_run:
-            self._ml_object.pathBuilder.schemas[
-                self.…
-                …
+            self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
+                [{"RID": self.execution_rid, "Duration": duration}]
+            )

     def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
         """Upload execution assets at _working_dir/Execution_asset.

         This routine uploads the contents of the
-        Execution_Asset directory …
+        Execution_Asset directory and then updates the execution_asset table in the ML schema to have references
         to these newly uploaded files.

         Returns:
             dict: Results of the upload operation.

         Raises:
-            DerivaMLException: If there is an issue uploading the assets.
+            DerivaMLException: If there is an issue when uploading the assets.
         """

         try:
@@ -494,9 +569,7 @@ class Execution:
         return asset_map

     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
-    def download_asset(
-        self, asset_rid: RID, dest_dir: Path, update_catalog=True
-    ) -> AssetFilePath:
+    def download_asset(self, asset_rid: RID, dest_dir: Path, update_catalog=True) -> AssetFilePath:
         """Download an asset from a URL and place it in a local directory.

         Args:
@@ -513,25 +586,17 @@ class Execution:
             raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")

         asset_record = self._ml_object.retrieve_rid(asset_rid)
-        asset_metadata = {
-            k: v
-            for k, v in asset_record.items()
-            if k in self._model.asset_metadata(asset_table)
-        }
+        asset_metadata = {k: v for k, v in asset_record.items() if k in self._model.asset_metadata(asset_table)}
         asset_url = asset_record["URL"]
         asset_filename = dest_dir / asset_record["Filename"]
         hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
         hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())

-        asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
-        type_path = self._ml_object.pathBuilder.schemas[
-            asset_type_table.schema.name
-        ].tables[asset_type_table.name]
+        asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
+        type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
         asset_types = [
             asset_type[MLVocab.asset_type.value]
-            for asset_type in type_path.filter(
-                type_path.columns[asset_table.name] == asset_rid
-            )
+            for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
             .attributes(type_path.Asset_Type)
             .fetch()
         ]
@@ -557,47 +622,58 @@ class Execution:
         self,
         assets_dir: str | Path,
     ) -> dict[Any, FileUploadState] | None:
-        """
+        """Uploads assets from a directory to the catalog.

-
-
+        Scans the specified directory for assets and uploads them to the catalog,
+        recording their metadata and types. Assets are organized by their types
+        and associated with the execution.

         Args:
-            assets_dir: Directory containing …
+            assets_dir: Directory containing assets to upload.

         Returns:
-
+            dict[Any, FileUploadState] | None: Mapping of assets to their upload states,
+                or None if no assets were found.

         Raises:
-            DerivaMLException: If …
+            DerivaMLException: If upload fails or assets are invalid.
+
+        Example:
+            >>> states = execution.upload_assets("output/results")
+            >>> for asset, state in states.items():
+            ...     print(f"{asset}: {state}")
         """

         def path_to_asset(path: str) -> str:
             """Pull the asset name out of a path to that asset in the filesystem"""
             components = path.split("/")
-            return components[
-                components.index("asset") + 2
-            ]  # Look for asset in the path to find the name
+            return components[components.index("asset") + 2]  # Look for asset in the path to find the name

         if not self._model.is_asset(Path(assets_dir).name):
             raise DerivaMLException("Directory does not have name of an asset table.")
         results = upload_directory(self._model, assets_dir)
         return {path_to_asset(p): r for p, r in results.items()}

-    def upload_execution_outputs(
-        …
-    ) -> dict[str, list[AssetFilePath]]:
-        """Upload all the assets and metadata associated with the current execution.
+    def upload_execution_outputs(self, clean_folder: bool = True) -> dict[str, list[AssetFilePath]]:
+        """Uploads all outputs from the execution to the catalog.

-
+        Scans the execution's output directories for assets, features, and other results,
+        then uploads them to the catalog. Can optionally clean up the output folders
+        after successful upload.

         Args:
-            clean_folder: …
+            clean_folder: Whether to delete output folders after upload. Defaults to True.

         Returns:
-
-
-
+            dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.
+
+        Raises:
+            DerivaMLException: If upload fails or outputs are invalid.
+
+        Example:
+            >>> outputs = execution.upload_execution_outputs()
+            >>> for type_name, paths in outputs.items():
+            ...     print(f"{type_name}: {len(paths)} files")
         """
         if self._dry_run:
             return {}
@@ -613,21 +689,40 @@ class Execution:
             raise e

     def _clean_folder_contents(self, folder_path: Path):
-        """
+        """Clean up folder contents with Windows-compatible error handling.

         Args:
-            folder_path: Path …
+            folder_path: Path to the folder to clean
         """
+        import time
+
+        MAX_RETRIES = 3
+        RETRY_DELAY = 1  # seconds
+
+        def remove_with_retry(path: Path, is_dir: bool = False) -> bool:
+            for attempt in range(MAX_RETRIES):
+                try:
+                    if is_dir:
+                        shutil.rmtree(path)
+                    else:
+                        Path(path).unlink()
+                    return True
+                except (OSError, PermissionError) as e:
+                    if attempt == MAX_RETRIES - 1:
+                        self.update_status(Status.failed, format_exception(e))
+                        return False
+                    time.sleep(RETRY_DELAY)
+            return False
+
         try:
             with os.scandir(folder_path) as entries:
                 for entry in entries:
                     if entry.is_dir() and not entry.is_symlink():
-                        …
+                        remove_with_retry(Path(entry.path), is_dir=True)
                     else:
-                        …
+                        remove_with_retry(Path(entry.path))
         except OSError as e:
-            …
-            self.update_status(Status.failed, error)
+            self.update_status(Status.failed, format_exception(e))

     def _update_feature_table(
         self,
@@ -642,28 +737,21 @@ class Execution:
             target_table: str:
             feature_name: str:
             feature_file: str | Path:
-            uploaded_files: Dictionary whose key …
+            uploaded_files: Dictionary whose key is an asset name, file-name pair, and whose value is a filename,
+                RID of that asset.
         """

         # Get the column names of all the Feature columns that should be the RID of an asset
         asset_columns = [
-            c.name
-            for c in self._ml_object.feature_record_class(
-                target_table, feature_name
-            ).feature.asset_columns
+            c.name for c in self._ml_object.feature_record_class(target_table, feature_name).feature.asset_columns
         ]

         # Get the names of the columns in the feature that are assets.
         asset_columns = [
-            c.name
-            for c in self._ml_object.feature_record_class(
-                target_table, feature_name
-            ).feature.asset_columns
+            c.name for c in self._ml_object.feature_record_class(target_table, feature_name).feature.asset_columns
         ]

-        feature_table = self._ml_object.feature_record_class(
-            target_table, feature_name
-        ).feature.feature_table.name
+        feature_table = self._ml_object.feature_record_class(target_table, feature_name).feature.feature_table.name
         asset_map = {
             (asset_table, asset.file_name): asset.asset_rid
             for asset_table, assets in uploaded_files.items()
@@ -677,41 +765,37 @@ class Execution:
             return e

         # Load the JSON file that has the set of records that contain the feature values.
-        with open(…
+        with Path(feature_file).open("r") as feature_values:
             entities = [json.loads(line.strip()) for line in feature_values]
         # Update the asset columns in the feature and add to the catalog.
-        self._ml_object.domain_path.tables[feature_table].insert(
-            [map_path(e) for e in entities], on_conflict_skip=True
-        )
+        self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)

     def _update_asset_execution_table(
         self,
         uploaded_assets: dict[str, list[AssetFilePath]],
         asset_role: str = "Output",
     ):
-        """Add entry to association table connecting an asset to an execution RID
+        """Add entry to the association table connecting an asset to an execution RID

         Args:
-            uploaded_assets: Dictionary whose key is the name of an asset table …
+            uploaded_assets: Dictionary whose key is the name of an asset table and whose value is a list of RIDs for
                 newly added assets to that table.
             asset_role: A term or list of terms from the Asset_Role vocabulary.
         """
-        # Make sure …
+        # Make sure the asset role is in the controlled vocabulary table.
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)

         pb = self._ml_object.pathBuilder
         for asset_table, asset_list in uploaded_assets.items():
-            asset_table_name = asset_table.split("/")[
-                …
-            ]  # Peel off the schema from the asset table
-            asset_exe = self._model.find_association(asset_table_name, "Execution")
+            asset_table_name = asset_table.split("/")[1]  # Peel off the schema from the asset table
+            asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
             asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]

             asset_exe_path.insert(
                 [
                     {
-                        …
-                        …
+                        asset_fk: asset_path.asset_rid,
+                        execution_fk: self.execution_rid,
                         "Asset_Role": asset_role,
                     }
                     for asset_path in asset_list
@@ -724,25 +808,20 @@ class Execution:
             if asset_role == "Input":
                 return
             asset_type_map = {}
-            with …
+            with Path(
                 asset_type_path(
                     self._working_dir,
                     self.execution_rid,
                     self._model.name_to_table(asset_table_name),
-                )
-                …
-                …
-                for line in f:
+                )
+            ).open("r") as asset_type_file:
+                for line in asset_type_file:
                     asset_type_map.update(json.loads(line.strip()))
             for asset_path in asset_list:
                 asset_path.asset_types = asset_type_map[asset_path.file_name]

-            asset_asset_type = self._model.find_association(
-                …
-            )
-            type_path = pb.schemas[asset_asset_type.schema.name].tables[
-                asset_asset_type.name
-            ]
+            asset_asset_type, _, _ = self._model.find_association(asset_table_name, "Asset_Type")
+            type_path = pb.schemas[asset_asset_type.schema.name].tables[asset_asset_type.name]

             type_path.insert(
                 [
@@ -758,13 +837,13 @@ class Execution:
         self,
         asset_name: str,
         file_name: str | Path,
-        asset_types: …
+        asset_types: list[str] | str | None = None,
         copy_file=False,
         **kwargs,
     ) -> AssetFilePath:
         """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.

-        Given the name of an asset table, and a file name, register the file for upload …
+        Given the name of an asset table, and a file name, register the file for upload and return a path to that
         file in the upload directory. In addition to the filename, additional asset metadata and file asset types may
         be specified.

@@ -772,13 +851,13 @@ class Execution:
         to a new file with the specified name is returned. The caller can then open that file for writing.

         If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
-        returned path contains a symbolic link to that file. If the copy_file argument is True then the contents of
+        returned path contains a symbolic link to that file. If the copy_file argument is True, then the contents of
         file_name are copied into the target directory.

         Args:
             asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
             file_name: Name of file to be uploaded.
-            asset_types: Type of asset to be uploaded. Defaults to name of the asset.
+            asset_types: Type of asset to be uploaded. Defaults to the name of the asset.
             **kwargs: Any additional metadata values that may be part of the asset table.

         Returns:
@@ -810,15 +889,17 @@ class Execution:
         if copy_file:
             asset_path.write_bytes(file_name.read_bytes())
         else:
-            …
+            try:
+                asset_path.symlink_to(file_name)
+            except (OSError, PermissionError):
+                # Fallback to copy if symlink fails (common on Windows)
+                asset_path.write_bytes(file_name.read_bytes())

         # Persist the asset types into a file
-        with …
-            asset_type_path(self._working_dir, self.execution_rid, asset_table)
-            …
-            …
-        ) as f:
-            f.write(json.dumps({file_name.name: asset_types}) + "\n")
+        with Path(
+            asset_type_path(self._working_dir, self.execution_rid, asset_table)
+        ).open("a") as asset_type_file:
+            asset_type_file.write(json.dumps({file_name.name: asset_types}) + "\n")

         return AssetFilePath(
             asset_path=asset_path,
@@ -838,26 +919,33 @@ class Execution:
             Pathlib path to the file in which to place table values.
         """
         if table not in self._model.schemas[self._ml_object.domain_schema].tables:
-            raise DerivaMLException(
-                "Table '{}' not found in domain schema".format(table)
-            )
+            raise DerivaMLException("Table '{}' not found in domain schema".format(table))

-        return table_path(
-            self._working_dir, schema=self._ml_object.domain_schema, table=table
-        )
+        return table_path(self._working_dir, schema=self._ml_object.domain_schema, table=table)

     def execute(self) -> Execution:
-        """Initiate an execution with provided configuration. Can be used in a context manager."""
+        """Initiate an execution with the provided configuration. Can be used in a context manager."""
         self.execution_start()
         return self

     @validate_call
     def add_features(self, features: Iterable[FeatureRecord]) -> None:
-        """
-
+        """Adds feature records to the catalog.
+
+        Associates feature records with this execution and uploads them to the catalog.
+        Features represent measurable properties or characteristics of records.
+
+        NOTE: The catalog is not updated until upload_execution_outputs() is called.

         Args:
-            features: …
+            features: Feature records to add, each containing a value and metadata.
+
+        Raises:
+            DerivaMLException: If feature addition fails or features are invalid.
+
+        Example:
+            >>> feature = FeatureRecord(value="high", confidence=0.95)
+            >>> execution.add_features([feature])
         """

         # Make sure feature list is homogeneous:
@@ -878,7 +966,7 @@ class Execution:
                 feature_name=feature.feature_name,
                 exec_rid=self.execution_rid,
             )
-            with open(…
+            with Path(json_path).open("a", encoding="utf-8") as file:
                 for feature in features:
                     feature.Execution = self.execution_rid
                     file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
@@ -888,7 +976,7 @@ class Execution:
         self,
         dataset_types: str | list[str],
         description: str,
-        version: …
+        version: DatasetVersion | None = None,
     ) -> RID:
         """Create a new dataset with specified types.

@@ -900,14 +988,12 @@ class Execution:
         Returns:
             RID of the newly created dataset.
         """
-        return self._ml_object.create_dataset(
-            dataset_types, description, self.execution_rid, version=version
-        )
+        return self._ml_object.create_dataset(dataset_types, description, self.execution_rid, version=version)

     def add_dataset_members(
         self,
         dataset_rid: RID,
-        members: list[RID],
+        members: list[RID] | dict[str, list[RID]],
         validate: bool = True,
         description: str = "",
     ) -> None:
@@ -920,7 +1006,7 @@ class Execution:
             been configured to be a dataset element type.

         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
             members: List of RIDs of members to add to the dataset_table. RID must be to a table type that is a
                 dataset element type (see DerivaML.add_dataset_element_type).
             validate: Check rid_list to make sure elements are not already in the dataset_table.
@@ -943,7 +1029,7 @@ class Execution:
             dataset_rid: RID to a dataset_table
             component: Which version of the dataset_table to increment.
             dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Major, Minor or Patch
+            component: Major, Minor, or Patch
             description: Description of the version update of the dataset_table.

         Returns:
@@ -963,13 +1049,42 @@ class Execution:
     def add_files(
         self,
         files: Iterable[FileSpec],
-        …
-        …
-        …
+        dataset_types: str | list[str] | None = None,
+        description: str = "",
+    ) -> RID:
+        """Adds files to the catalog with their metadata.
+
+        Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
+        specified file types.
+
+        Args:
+            files: File specifications containing MD5 checksum, length, and URL.
+            dataset_types: One or more dataset type terms from File_Type vocabulary.
+            description: Description of the files.
+
+        Returns:
+            RID: Dataset RID that identifes newly added files. Will be nested to mirror origioanl directory structure
+                of the files.
+
+        Raises:
+            DerivaMLInvalidTerm: If file_types are invalid or execution_rid is not an execution record.
+
+        Examples:
+            Add a single file type:
+            >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
+            >>> rids = exe.add_files(files, file_types="text")
+
+            Add multiple file types:
+            >>> rids = exe.add_files(
+            ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
+            ...     file_types=["image", "png"],
+            ... )
+        """
         return self._ml_object.add_files(
             files=files,
-            …
+            dataset_types=dataset_types,
             execution_rid=self.execution_rid,
+            description=description,
         )

     def __str__(self):
@@ -1015,7 +1130,5 @@ class Execution:
                 Status.failed,
                 f"Exception type: {exc_type}, Exception value: {exc_value}",
             )
-            logging.error(
-                f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}"
-            )
+            logging.error(f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}")
         return False