deriva-ml 1.17.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/.DS_Store +0 -0
- deriva_ml/__init__.py +79 -0
- deriva_ml/bump_version.py +142 -0
- deriva_ml/core/__init__.py +39 -0
- deriva_ml/core/base.py +1527 -0
- deriva_ml/core/config.py +69 -0
- deriva_ml/core/constants.py +36 -0
- deriva_ml/core/definitions.py +74 -0
- deriva_ml/core/enums.py +222 -0
- deriva_ml/core/ermrest.py +288 -0
- deriva_ml/core/exceptions.py +28 -0
- deriva_ml/core/filespec.py +116 -0
- deriva_ml/dataset/__init__.py +12 -0
- deriva_ml/dataset/aux_classes.py +225 -0
- deriva_ml/dataset/dataset.py +1519 -0
- deriva_ml/dataset/dataset_bag.py +450 -0
- deriva_ml/dataset/history.py +109 -0
- deriva_ml/dataset/upload.py +439 -0
- deriva_ml/demo_catalog.py +495 -0
- deriva_ml/execution/__init__.py +26 -0
- deriva_ml/execution/environment.py +290 -0
- deriva_ml/execution/execution.py +1180 -0
- deriva_ml/execution/execution_configuration.py +147 -0
- deriva_ml/execution/workflow.py +413 -0
- deriva_ml/feature.py +228 -0
- deriva_ml/install_kernel.py +71 -0
- deriva_ml/model/__init__.py +0 -0
- deriva_ml/model/catalog.py +485 -0
- deriva_ml/model/database.py +719 -0
- deriva_ml/protocols/dataset.py +19 -0
- deriva_ml/run_notebook.py +228 -0
- deriva_ml/schema/__init__.py +3 -0
- deriva_ml/schema/annotations.py +473 -0
- deriva_ml/schema/check_schema.py +104 -0
- deriva_ml/schema/create_schema.py +393 -0
- deriva_ml/schema/deriva-ml-reference.json +8525 -0
- deriva_ml/schema/policy.json +81 -0
- deriva_ml/schema/table_comments_utils.py +57 -0
- deriva_ml/test.py +94 -0
- deriva_ml-1.17.10.dist-info/METADATA +38 -0
- deriva_ml-1.17.10.dist-info/RECORD +45 -0
- deriva_ml-1.17.10.dist-info/WHEEL +5 -0
- deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
- deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
- deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1180 @@
|
|
|
1
|
+
"""Execution management for DerivaML.
|
|
2
|
+
|
|
3
|
+
This module provides functionality for managing and tracking executions in DerivaML. An execution
|
|
4
|
+
represents a computational or manual process that operates on datasets and produces outputs.
|
|
5
|
+
The module includes:
|
|
6
|
+
|
|
7
|
+
- Execution class: Core class for managing execution state and context
|
|
8
|
+
- Asset management: Track input and output files
|
|
9
|
+
- Status tracking: Monitor and update execution progress
|
|
10
|
+
- Dataset handling: Download and materialize required datasets
|
|
11
|
+
- Provenance tracking: Record relationships between inputs, processes, and outputs
|
|
12
|
+
|
|
13
|
+
The Execution class serves as the primary interface for managing the lifecycle of a computational
|
|
14
|
+
or manual process within DerivaML.
|
|
15
|
+
|
|
16
|
+
Typical usage example:
|
|
17
|
+
>>> config = ExecutionConfiguration(workflow="analysis_workflow", description="Data analysis")
|
|
18
|
+
>>> with ml.create_execution(config) as execution:
|
|
19
|
+
... execution.download_dataset_bag(dataset_spec)
|
|
20
|
+
... # Run analysis
|
|
21
|
+
... execution.upload_execution_outputs()
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
import shutil
|
|
30
|
+
import sys
|
|
31
|
+
from collections import defaultdict
|
|
32
|
+
from datetime import datetime
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
from typing import Any, Iterable, List
|
|
35
|
+
|
|
36
|
+
from deriva.core import format_exception
|
|
37
|
+
from deriva.core.hatrac_store import HatracStore
|
|
38
|
+
from pydantic import ConfigDict, validate_call
|
|
39
|
+
|
|
40
|
+
from deriva_ml.core.base import DerivaML
|
|
41
|
+
from deriva_ml.core.definitions import (
|
|
42
|
+
DRY_RUN_RID,
|
|
43
|
+
RID,
|
|
44
|
+
ExecMetadataType,
|
|
45
|
+
FileSpec,
|
|
46
|
+
FileUploadState,
|
|
47
|
+
MLAsset,
|
|
48
|
+
MLVocab,
|
|
49
|
+
Status,
|
|
50
|
+
)
|
|
51
|
+
from deriva_ml.core.exceptions import DerivaMLException
|
|
52
|
+
from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion, VersionPart
|
|
53
|
+
from deriva_ml.dataset.dataset_bag import DatasetBag
|
|
54
|
+
from deriva_ml.dataset.upload import (
|
|
55
|
+
asset_file_path,
|
|
56
|
+
asset_root,
|
|
57
|
+
asset_type_path,
|
|
58
|
+
execution_root,
|
|
59
|
+
feature_root,
|
|
60
|
+
feature_value_path,
|
|
61
|
+
is_feature_dir,
|
|
62
|
+
normalize_asset_dir,
|
|
63
|
+
table_path,
|
|
64
|
+
upload_directory,
|
|
65
|
+
)
|
|
66
|
+
from deriva_ml.execution.environment import get_execution_environment
|
|
67
|
+
from deriva_ml.execution.execution_configuration import ExecutionConfiguration
|
|
68
|
+
from deriva_ml.execution.workflow import Workflow
|
|
69
|
+
from deriva_ml.feature import FeatureRecord
|
|
70
|
+
|
|
71
|
+
# Keep pycharm from complaining about undefined references in docstrings.
execution: Execution
ml: DerivaML
dataset_spec: DatasetSpec

try:
    from icecream import ic
except ImportError:  # Graceful fallback if IceCream isn't installed.

    def ic(*args):  # noqa
        """Fallback for icecream.ic: return the argument(s) unchanged.

        Mirrors icecream's return convention: no arguments -> None, a single
        argument -> that argument, several arguments -> a tuple of them.
        (A def instead of a lambda assignment, per PEP 8 E731.)
        """
        if not args:
            return None
        return args[0] if len(args) == 1 else args


try:
    from IPython.display import Markdown, display
except ImportError:
    # Plain-text stand-ins so the module also works outside IPython/Jupyter.

    def display(s):
        print(s)

    def Markdown(s):
        return s
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Platform-specific base class
|
|
94
|
+
# Platform/version-specific base class: on Python >= 3.12, pathlib.Path supports
# subclassing with a normal __init__; on earlier versions the concrete class must
# be obtained via type(Path()) and extra attributes attached in __new__.
if sys.version_info >= (3, 12):

    class AssetFilePath(Path):
        """Extended Path class for managing asset files.

        Represents a file path with additional metadata about its role as an asset in the catalog.
        This class extends the standard Path class to include information about the asset's
        catalog representation and type.

        Attributes:
            asset_name (str): Name of the asset in the catalog (e.g., asset table name).
            file_name (str): Name of the local file containing the asset.
            asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
            asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
            asset_rid (RID | None): Resource Identifier if uploaded to an asset table.

        Example:
            >>> path = AssetFilePath(
            ...     "/path/to/file.txt",
            ...     asset_name="analysis_output",
            ...     file_name="results.txt",
            ...     asset_metadata={"version": "1.0"},
            ...     asset_types=["text", "results"]
            ... )
        """

        def __init__(
            self,
            asset_path: str | Path,
            asset_name: str,
            file_name: str,
            asset_metadata: dict[str, Any],
            asset_types: list[str] | str,
            asset_rid: RID | None = None,
        ):
            """Initializes an AssetFilePath instance.

            Args:
                asset_path: Local path to the asset file.
                asset_name: Name of the asset in the catalog.
                file_name: Name of the local file.
                asset_metadata: Additional metadata columns.
                asset_types: One or more asset type terms.
                asset_rid: Optional Resource Identifier if already in catalog.
            """
            # Only the path itself goes to the Path base class.
            super().__init__(asset_path)
            self.asset_name = asset_name
            self.file_name = file_name
            self.asset_metadata = asset_metadata
            # Normalize a single type term into a one-element list.
            self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
            self.asset_rid = asset_rid
else:

    # type(Path()) yields the platform-specific concrete class (PosixPath or
    # WindowsPath), which is the only subclassable form before Python 3.12.
    class AssetFilePath(type(Path())):
        """
        Create a new Path object that has additional information related to the use of this path as an asset.

        Attributes:
            asset_path: Local path to the location of the asset.
            asset_name: The name of the asset in the catalog (e.g., the asset table name).
            file_name: Name of the local file that contains the contents of the asset.
            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
            asset_types: A list of terms from the Asset_Type controlled vocabulary.
            asset_rid: The RID of the asset if it has been uploaded into an asset table
        """

        def __new__(
            cls,
            asset_path: str | Path,
            asset_name: str,
            file_name: str,
            asset_metadata: dict[str, Any],
            asset_types: list[str] | str,
            asset_rid: RID | None = None,
        ):
            # Only pass the path to the base Path class; pre-3.12 Path builds
            # the instance in __new__, so the extra attributes are set here.
            obj = super().__new__(cls, asset_path)
            obj.asset_name = asset_name
            obj.file_name = file_name
            obj.asset_metadata = asset_metadata
            # Normalize a single type term into a one-element list.
            obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
            obj.asset_rid = asset_rid
            return obj
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class Execution:
|
|
180
|
+
"""Manages the lifecycle and context of a DerivaML execution.
|
|
181
|
+
|
|
182
|
+
An Execution represents a computational or manual process within DerivaML. It provides:
|
|
183
|
+
- Dataset materialization and access
|
|
184
|
+
- Asset management (inputs and outputs)
|
|
185
|
+
- Status tracking and updates
|
|
186
|
+
- Provenance recording
|
|
187
|
+
- Result upload and cataloging
|
|
188
|
+
|
|
189
|
+
The class handles downloading required datasets and assets, tracking execution state,
|
|
190
|
+
and managing the upload of results. Every dataset and file generated is associated
|
|
191
|
+
with an execution record for provenance tracking.
|
|
192
|
+
|
|
193
|
+
Attributes:
|
|
194
|
+
dataset_rids (list[RID]): RIDs of datasets used in the execution.
|
|
195
|
+
datasets (list[DatasetBag]): Materialized dataset objects.
|
|
196
|
+
configuration (ExecutionConfiguration): Execution settings and parameters.
|
|
197
|
+
workflow_rid (RID): RID of the associated workflow.
|
|
198
|
+
status (Status): Current execution status.
|
|
199
|
+
asset_paths (list[AssetFilePath]): Paths to execution assets.
|
|
200
|
+
start_time (datetime | None): When execution started.
|
|
201
|
+
stop_time (datetime | None): When execution completed.
|
|
202
|
+
|
|
203
|
+
Example:
|
|
204
|
+
>>> config = ExecutionConfiguration(
|
|
205
|
+
... workflow="analysis",
|
|
206
|
+
... description="Process samples",
|
|
207
|
+
... )
|
|
208
|
+
>>> with ml.create_execution(config) as execution:
|
|
209
|
+
... execution.download_dataset_bag(dataset_spec)
|
|
210
|
+
... # Run analysis
|
|
211
|
+
... execution.upload_execution_outputs()
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def __init__(
    self,
    configuration: ExecutionConfiguration,
    ml_object: DerivaML,
    workflow: Workflow | RID | None = None,
    reload: RID | None = None,
    dry_run: bool = False,
):
    """Initializes an Execution instance.

    Creates a new execution or reloads an existing one. Initializes the execution
    environment, downloads required datasets, and sets up asset tracking.

    Args:
        configuration: Settings and parameters for the execution.
        ml_object: DerivaML instance managing the execution.
        workflow: Optional workflow RID or Workflow object. If not specified, the workflow RID is taken from
            the ExecutionConfiguration object
        reload: Optional RID of existing execution to reload.
        dry_run: If True, don't create catalog records or upload results.

    Raises:
        DerivaMLException: If initialization fails or configuration is invalid.
    """

    self.asset_paths: dict[str, list[AssetFilePath]] = {}
    self.configuration = configuration
    self._ml_object = ml_object
    self._model = ml_object.model
    self._logger = ml_object._logger
    self.start_time = None
    self.stop_time = None
    self.status = Status.created
    self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
    # Record the command line used to launch this process for provenance.
    self.configuration.argv = sys.argv

    self.dataset_rids: List[RID] = []
    self.datasets: list[DatasetBag] = []

    self._working_dir = self._ml_object.working_dir
    self._cache_dir = self._ml_object.cache_dir
    self._dry_run = dry_run

    # Make sure we have a good workflow.  An explicit `workflow` argument
    # overrides whatever the configuration carried.
    if workflow:
        self.configuration.workflow = workflow
    if isinstance(self.configuration.workflow, Workflow):
        # Validate the workflow type term; in a dry run no catalog record is
        # created and the sentinel DRY_RUN_RID is used instead.
        self._ml_object.lookup_term(MLVocab.workflow_type, configuration.workflow.workflow_type)
        self.workflow_rid = (
            self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
        )
    else:
        # A bare RID was supplied; confirm it actually names a Workflow row.
        self.workflow_rid = self.configuration.workflow
        if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
            raise DerivaMLException("Workflow specified in execution configuration is not a Workflow")

    # Validate the datasets and assets to be valid.
    for d in self.configuration.datasets:
        if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
            raise DerivaMLException("Dataset specified in execution configuration is not a dataset")

    for a in self.configuration.assets:
        if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
            raise DerivaMLException("Asset specified in execution configuration is not a asset table")

    schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
    if reload:
        # Reloading an existing execution; a DRY_RUN sentinel RID implies the
        # original run was a dry run, so inherit that mode.
        self.execution_rid = reload
        if self.execution_rid == DRY_RUN_RID:
            self._dry_run = True
    elif self._dry_run:
        self.execution_rid = DRY_RUN_RID
    else:
        # Create the catalog record for this execution.
        self.execution_rid = schema_path.Execution.insert(
            [
                {
                    "Description": self.configuration.description,
                    "Workflow": self.workflow_rid,
                }
            ]
        )[0]["RID"]

    if rid_path := os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID", None):
        # Put execution_rid into the provided file path so we can find it later.
        with Path(rid_path).open("w") as f:
            json.dump(
                {
                    "hostname": self._ml_object.host_name,
                    "catalog_id": self._ml_object.catalog_id,
                    "workflow_rid": self.workflow_rid,
                    "execution_rid": self.execution_rid,
                },
                f,
            )

    # Create a directory for execution rid so we can recover the state in case of a crash.
    execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
    self._initialize_execution(reload)
|
|
313
|
+
|
|
314
|
+
def _save_runtime_environment(self):
    """Snapshot the current runtime environment as an Execution_Metadata asset.

    Writes the JSON-serialized environment description (from
    get_execution_environment) to a timestamped file staged for upload.
    """
    snapshot_name = f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    env_file = self.asset_file_path(
        asset_name="Execution_Metadata",
        file_name=snapshot_name,
        asset_types=ExecMetadataType.runtime_env.value,
    )
    environment = get_execution_environment()
    with Path(env_file).open("w") as fp:
        json.dump(environment, fp)
|
|
322
|
+
|
|
323
|
+
def _upload_hydra_config_assets(self):
    """Stage hydra runtime output files as Execution_Metadata assets.

    Walks the hydra runtime output directory (if configured) and registers
    every file found, renamed with a hydra-<timestamp>- prefix, for upload.
    """
    hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
    if not hydra_runtime_output_dir:
        return
    # Last path component is assumed to be hydra's run timestamp directory
    # (presumably of the form produced by hydra's output dir template).
    timestamp = hydra_runtime_output_dir.parts[-1]
    for hydra_asset in hydra_runtime_output_dir.rglob("*"):
        if hydra_asset.is_dir():
            continue
        # rglob already yields paths rooted at hydra_runtime_output_dir, so
        # pass the path as-is; re-joining it with the directory (as the old
        # code did) duplicated the prefix whenever the directory was relative.
        self.asset_file_path(
            asset_name=MLAsset.execution_metadata,
            file_name=hydra_asset,
            rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
            asset_types=ExecMetadataType.execution_config.value,
        )
|
|
337
|
+
|
|
338
|
+
def _initialize_execution(self, reload: RID | None = None) -> None:
    """Initialize the execution from its configuration.

    Materializes the configured dataset bags, downloads configured input
    assets, and (unless reloading) saves the configuration, environment
    snapshot, and hydra/lock-file metadata, then uploads those metadata
    files immediately so provenance survives a crash.

    Args:
        reload: RID of previously initialized execution.  When set, the
            catalog linkage and metadata uploads are skipped since they
            already exist.

    Raises:
        DerivaMLException: If there is an issue initializing the execution.
    """
    # Materialize bdbag for each configured dataset.
    for dataset in self.configuration.datasets:
        self.update_status(Status.initializing, f"Materialize bag {dataset.rid}... ")
        self.datasets.append(self.download_dataset_bag(dataset))
        self.dataset_rids.append(dataset.rid)

    # Update execution info: link the datasets to this execution in the
    # catalog (skipped on reload and in dry-run mode).
    schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
    if self.dataset_rids and not (reload or self._dry_run):
        schema_path.Dataset_Execution.insert(
            [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
        )

    # Download assets listed in the configuration into a per-table directory.
    self.update_status(Status.running, "Downloading assets ...")
    self.asset_paths = {}
    for asset_rid in self.configuration.assets:
        asset_table = self._ml_object.resolve_rid(asset_rid).table.name
        dest_dir = (
            execution_root(self._ml_object.working_dir, self.execution_rid) / "downloaded-assets" / asset_table
        )
        dest_dir.mkdir(parents=True, exist_ok=True)
        self.asset_paths.setdefault(asset_table, []).append(
            self.download_asset(
                asset_rid=asset_rid,
                dest_dir=dest_dir,
                update_catalog=not (reload or self._dry_run),
            )
        )

    # Save configuration details for later upload
    if not reload:
        cfile = self.asset_file_path(
            asset_name=MLAsset.execution_metadata,
            file_name="configuration.json",
            asset_types=ExecMetadataType.execution_config.value,
        )

        with Path(cfile).open("w", encoding="utf-8") as config_file:
            json.dump(self.configuration.model_dump(mode="json"), config_file)
        # If the workflow's repository has a uv.lock, capture it so the exact
        # dependency set is part of the execution record.
        lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
        if lock_file.exists():
            _ = self.asset_file_path(
                asset_name=MLAsset.execution_metadata,
                file_name=lock_file,
                asset_types=ExecMetadataType.execution_config.value,
            )

        self._upload_hydra_config_assets()

        # save runtime env
        self._save_runtime_environment()

        # Now upload the files so we have the info in case the execution fails.
        self.uploaded_assets = self._upload_execution_dirs()
    self.start_time = datetime.now()
    self.update_status(Status.pending, "Initialize status finished.")
|
|
407
|
+
|
|
408
|
+
@property
def working_dir(self) -> Path:
    """Directory in which all files for this execution are staged."""
    root = self._execution_root
    return root
|
|
412
|
+
|
|
413
|
+
@property
def _execution_root(self) -> Path:
    """Root directory for all files belonging to this execution.

    Returns:
        Path derived from the DerivaML working directory and this
        execution's RID.
    """
    return execution_root(self._working_dir, self.execution_rid)
|
|
424
|
+
|
|
425
|
+
@property
def _feature_root(self) -> Path:
    """Root directory for feature files produced by this execution.

    Returns:
        Path derived from the DerivaML working directory and this
        execution's RID.
    """
    return feature_root(self._working_dir, self.execution_rid)
|
|
436
|
+
|
|
437
|
+
@property
def _asset_root(self) -> Path:
    """Root directory for asset files staged for upload by this execution.

    Returns:
        Path derived from the DerivaML working directory and this
        execution's RID.
    """
    return asset_root(self._working_dir, self.execution_rid)
|
|
448
|
+
|
|
449
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
    """Download and materialize a dataset for use in this execution.

    Delegates to the owning DerivaML instance, tagging the download with
    this execution's RID so provenance is recorded.

    Args:
        dataset: Specification of the dataset to download, including
            version and materialization options.

    Returns:
        DatasetBag: the materialized dataset (local path, RID, and minid).

    Raises:
        DerivaMLException: If download or materialization fails.

    Example:
        >>> spec = DatasetSpec(rid="1-abc123", version="1.2.0")
        >>> bag = execution.download_dataset_bag(spec)
        >>> print(f"Downloaded to {bag.path}")
    """
    ml = self._ml_object
    return ml.download_dataset_bag(dataset, execution_rid=self.execution_rid)
|
|
475
|
+
|
|
476
|
+
@validate_call
def update_status(self, status: Status, msg: str) -> None:
    """Record a new execution status in the catalog.

    Stores the status and message on the Execution record so progress can
    be tracked remotely.  In dry-run mode the status is only kept locally
    and logged.

    Args:
        status: New status value (e.g., running, completed, failed).
        msg: Description of the status change or current state.

    Raises:
        DerivaMLException: If status update fails.

    Example:
        >>> execution.update_status(Status.running, "Processing sample 1 of 10")
    """
    self.status = status
    self._logger.info(msg)

    if self._dry_run:
        return

    status_record = {
        "RID": self.execution_rid,
        "Status": self.status.value,
        "Status_Detail": msg,
    }
    ml_schema = self._ml_object.ml_schema
    self._ml_object.pathBuilder.schemas[ml_schema].Execution.update([status_record])
|
|
508
|
+
|
|
509
|
+
def execution_start(self) -> None:
    """Mark the beginning of the execution's main work.

    Resets the uploaded-assets record, stamps the start time, and pushes an
    'initializing' status to the catalog.  Call before the main work begins.

    Example:
        >>> execution.execution_start()
        >>> try:
        ...     # Run analysis
        ...     execution.execution_stop()
        ... except Exception:
        ...     execution.update_status(Status.failed, "Analysis error")
    """
    self.uploaded_assets = None
    self.start_time = datetime.now()
    self.update_status(Status.initializing, "Start execution ...")
|
|
526
|
+
|
|
527
|
+
def execution_stop(self) -> None:
    """Marks the execution as completed.

    Records the stop time, computes the elapsed duration, and updates the
    execution's status to 'completed'.  This should be called after all
    execution work is finished.

    Example:
        >>> try:
        ...     # Run analysis
        ...     execution.execution_stop()
        ... except Exception:
        ...     execution.update_status(Status.failed, "Analysis error")
    """
    self.stop_time = datetime.now()
    duration = self.stop_time - self.start_time
    hours, remainder = divmod(duration.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    # divmod on floats yields float quotients; convert hours/minutes to int
    # so the catalog shows "1H 2min" rather than "1.0H 2.0min".
    duration = f"{int(hours)}H {int(minutes)}min {round(seconds, 4)}sec"

    self.update_status(Status.completed, "Algorithm execution ended.")
    if not self._dry_run:
        self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
            [{"RID": self.execution_rid, "Duration": duration}]
        )
|
|
551
|
+
|
|
552
|
+
def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
    """Upload execution assets staged under the execution's asset root.

    This routine uploads the contents of the Execution_Asset directory and
    then updates the execution_asset table in the ML schema to have
    references to these newly uploaded files, then registers any feature
    value files found under the feature root.

    Returns:
        dict: Mapping from asset-table path ("schema/table") to the
        AssetFilePath objects created for each uploaded file.

    Raises:
        DerivaMLException: If there is an issue when uploading the assets.
    """

    try:
        self.update_status(Status.running, "Uploading execution files...")
        results = upload_directory(self._model, self._asset_root)
    except RuntimeError as e:
        error = format_exception(e)
        self.update_status(Status.failed, error)
        raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")

    # Group the upload results by asset table, wrapping each as an
    # AssetFilePath that carries the catalog RID and metadata columns.
    asset_map = {}
    for path, status in results.items():
        asset_table, file_name = normalize_asset_dir(path)

        asset_map.setdefault(asset_table, []).append(
            AssetFilePath(
                asset_path=path,
                asset_name=asset_table,
                file_name=file_name,
                asset_metadata={
                    k: v
                    for k, v in status.result.items()
                    # asset_table is "schema/table"; metadata columns are
                    # looked up by the bare table name.
                    if k in self._model.asset_metadata(asset_table.split("/")[1])
                },
                asset_types=[],
                asset_rid=status.result["RID"],
            )
        )
    self._update_asset_execution_table(asset_map)
    self.update_status(Status.running, "Updating features...")

    # Each .jsonl under the feature root holds feature values; its parent
    # directory encodes the target table and feature name.
    for p in self._feature_root.glob("**/*.jsonl"):
        m = is_feature_dir(p.parent)
        self._update_feature_table(
            target_table=m["target_table"],
            feature_name=m["feature_name"],
            feature_file=p,
            uploaded_files=asset_map,
        )

    self.update_status(Status.running, "Upload assets complete")
    return asset_map
|
|
606
|
+
|
|
607
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def download_asset(self, asset_rid: RID, dest_dir: Path, update_catalog=True) -> AssetFilePath:
    """Download an asset from a URL and place it in a local directory.

    Args:
        asset_rid: RID of the asset.
        dest_dir: Destination directory for the asset.
        update_catalog: Whether to update the catalog execution information after downloading.

    Returns:
        An AssetFilePath for the downloaded file, carrying the asset table
        name, metadata columns, asset types, and RID.

    Raises:
        DerivaMLException: If asset_rid does not refer to a row of an asset table.
    """

    asset_table = self._ml_object.resolve_rid(asset_rid).table
    if not self._model.is_asset(asset_table):
        raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")

    # Fetch the catalog row; keep only columns that are asset metadata
    # (i.e., beyond the standard URL/Length/checksum columns).
    asset_record = self._ml_object.retrieve_rid(asset_rid)
    asset_metadata = {k: v for k, v in asset_record.items() if k in self._model.asset_metadata(asset_table)}
    asset_url = asset_record["URL"]
    asset_filename = dest_dir / asset_record["Filename"]
    # Pull the object bytes out of Hatrac into the destination file.
    hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
    hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())

    # Collect the Asset_Type vocabulary terms linked to this asset through
    # the asset/Asset_Type association table.
    asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
    type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
    asset_types = [
        asset_type[MLVocab.asset_type.value]
        for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
        .attributes(type_path.Asset_Type)
        .fetch()
    ]

    asset_path = AssetFilePath(
        file_name=asset_filename,
        asset_rid=asset_rid,
        asset_path=asset_filename,
        asset_metadata=asset_metadata,
        asset_name=asset_table.name,
        asset_types=asset_types,
    )

    if update_catalog:
        # Record that this asset was an input to the current execution.
        self._update_asset_execution_table(
            {f"{asset_table.schema.name}/{asset_table.name}": [asset_path]},
            asset_role="Input",
        )
    return asset_path
|
|
655
|
+
|
|
656
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def upload_assets(
    self,
    assets_dir: str | Path,
) -> dict[Any, FileUploadState] | None:
    """Upload every asset found under a directory to the catalog.

    The directory must be named after an asset table; its contents are
    uploaded and each result is keyed by the asset name extracted from the
    uploaded file's path.

    Args:
        assets_dir: Directory containing assets to upload.

    Returns:
        dict[Any, FileUploadState] | None: Mapping of assets to their upload states,
            or None if no assets were found.

    Raises:
        DerivaMLException: If upload fails or the directory is not named
            after an asset table.

    Example:
        >>> states = execution.upload_assets("output/results")
        >>> for asset, state in states.items():
        ...     print(f"{asset}: {state}")
    """
    if not self._model.is_asset(Path(assets_dir).name):
        raise DerivaMLException("Directory does not have name of an asset table.")

    def path_to_asset(path: str) -> str:
        """Extract the asset name from a filesystem path to that asset."""
        parts = path.split("/")
        # The asset name sits two components past the "asset" directory.
        return parts[parts.index("asset") + 2]

    upload_results = upload_directory(self._model, assets_dir)
    return {path_to_asset(name): state for name, state in upload_results.items()}
|
|
692
|
+
|
|
693
|
+
def upload_execution_outputs(self, clean_folder: bool = True) -> dict[str, list[AssetFilePath]]:
    """Uploads all outputs from the execution to the catalog.

    Scans the execution's output directories for assets, features, and other results,
    then uploads them to the catalog. Can optionally clean up the output folders
    after successful upload.

    Args:
        clean_folder: Whether to delete output folders after upload. Defaults to True.

    Returns:
        dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.

    Raises:
        DerivaMLException: If upload fails or outputs are invalid.

    Example:
        >>> outputs = execution.upload_execution_outputs()
        >>> for type_name, paths in outputs.items():
        ...     print(f"{type_name}: {len(paths)} files")
    """
    if self._dry_run:
        # Nothing is uploaded during a dry run.
        return {}
    try:
        self.uploaded_assets = self._upload_execution_dirs()
        self.update_status(Status.completed, "Successfully end the execution.")
        if clean_folder:
            self._clean_folder_contents(self._execution_root)
        return self.uploaded_assets
    except Exception as e:
        # Record the failure in the execution status, then re-raise.
        self.update_status(Status.failed, format_exception(e))
        # Bare raise preserves the original traceback without adding a frame
        # (the original code used `raise e`).
        raise
|
|
726
|
+
|
|
727
|
+
def _clean_folder_contents(self, folder_path: Path):
    """Delete every entry under *folder_path*, retrying transient failures.

    Each deletion is attempted up to three times with a one-second pause
    between attempts (Windows-compatible handling of transient file locks).
    Failures are reported through update_status rather than raised.

    Args:
        folder_path: Path to the folder to clean
    """
    import time

    attempts_allowed = 3
    pause_seconds = 1

    def _delete(target: Path, directory: bool = False) -> bool:
        """Remove one filesystem entry, retrying; report on final failure."""
        for tries_left in range(attempts_allowed, 0, -1):
            try:
                if directory:
                    shutil.rmtree(target)
                else:
                    Path(target).unlink()
                return True
            except (OSError, PermissionError) as err:
                if tries_left == 1:
                    # Out of retries: record the failure and give up.
                    self.update_status(Status.failed, format_exception(err))
                    return False
                time.sleep(pause_seconds)
        return False

    try:
        with os.scandir(folder_path) as folder_entries:
            for item in folder_entries:
                # Symlinked directories are unlinked, not recursed into.
                treat_as_dir = item.is_dir() and not item.is_symlink()
                _delete(Path(item.path), directory=treat_as_dir)
    except OSError as err:
        self.update_status(Status.failed, format_exception(err))
|
|
762
|
+
|
|
763
|
+
def _update_feature_table(
    self,
    target_table: str,
    feature_name: str,
    feature_file: str | Path,
    uploaded_files: dict[str, list[AssetFilePath]],
) -> None:
    """Insert feature values from a local JSON-lines file into the catalog.

    Asset-valued columns in each record hold local file references; these are
    rewritten to the RIDs of the corresponding uploaded assets before insert.

    Args:
        target_table: Name of the table the feature is attached to.
        feature_name: Name of the feature whose values are being inserted.
        feature_file: Path to a JSON-lines file of feature records.
        uploaded_files: Dictionary whose key is an asset name, file-name pair, and whose value is a filename,
            RID of that asset.
    """
    # Look up the feature definition once (the original recomputed
    # asset_columns twice with identical code and called
    # feature_record_class three times).
    feature = self._ml_object.feature_record_class(target_table, feature_name).feature

    # Names of the feature columns whose values should be the RID of an asset.
    asset_columns = [c.name for c in feature.asset_columns]
    feature_table = feature.feature_table.name

    # Map (asset table, file name) -> RID of the uploaded asset.
    asset_map = {
        (asset_table, asset.file_name): asset.asset_rid
        for asset_table, assets in uploaded_files.items()
        for asset in assets
    }

    def map_path(e):
        """Go through the asset columns and replace the file name with the RID for the uploaded file."""
        for c in asset_columns:
            e[c] = asset_map[normalize_asset_dir(e[c])]
        return e

    # Load the JSON file that has the set of records that contain the feature values.
    with Path(feature_file).open("r") as feature_values:
        entities = [json.loads(line.strip()) for line in feature_values]
    # Update the asset columns in the feature and add to the catalog.
    self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
|
|
808
|
+
|
|
809
|
+
def _update_asset_execution_table(
    self,
    uploaded_assets: dict[str, list[AssetFilePath]],
    asset_role: str = "Output",
) -> None:
    """Add entry to the association table connecting an asset to an execution RID

    Args:
        uploaded_assets: Dictionary whose key is the name of an asset table and whose value is a list of RIDs for
            newly added assets to that table.
        asset_role: A term or list of terms from the Asset_Role vocabulary.
    """
    if self._dry_run:
        # Don't do any updates if we are doing a dry run.
        return
    # Make sure the asset role is in the controlled vocabulary table.
    self._ml_object.lookup_term(MLVocab.asset_role, asset_role)

    pb = self._ml_object.pathBuilder
    for asset_table, asset_list in uploaded_assets.items():
        # Keys are "schema/table" strings.
        asset_table_name = asset_table.split("/")[1]  # Peel off the schema from the asset table
        asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
        asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]

        # Link every uploaded asset in this table to the current execution.
        asset_exe_path.insert(
            [
                {
                    asset_fk: asset_path.asset_rid,
                    execution_fk: self.execution_rid,
                    "Asset_Role": asset_role,
                }
                for asset_path in asset_list
            ],
            on_conflict_skip=True,
        )

        # Now add in the type names via the asset_asset_type association table.
        # Get the list of types for each file in the asset.
        # NOTE(review): this return exits the whole loop, so for "Input" role
        # any remaining asset tables get no association rows — callers appear
        # to pass a single-table dict for inputs; confirm that is always true.
        if asset_role == "Input":
            return
        # Rebuild the file-name -> asset-types mapping persisted by
        # asset_file_path (one JSON object per line).
        asset_type_map = {}
        with Path(
            asset_type_path(
                self._working_dir,
                self.execution_rid,
                self._model.name_to_table(asset_table_name),
            )
        ).open("r") as asset_type_file:
            for line in asset_type_file:
                asset_type_map.update(json.loads(line.strip()))
        # Attach the recorded types to each uploaded asset record.
        for asset_path in asset_list:
            asset_path.asset_types = asset_type_map[asset_path.file_name]

        asset_asset_type, _, _ = self._model.find_association(asset_table_name, "Asset_Type")
        type_path = pb.schemas[asset_asset_type.schema.name].tables[asset_asset_type.name]

        # One row per (asset, type) pair.
        type_path.insert(
            [
                {asset_table_name: asset.asset_rid, "Asset_Type": t}
                for asset in asset_list
                for t in asset_type_map[asset.file_name]
            ],
            on_conflict_skip=True,
        )
|
|
873
|
+
|
|
874
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def asset_file_path(
    self,
    asset_name: str,
    file_name: str | Path,
    asset_types: list[str] | str | None = None,
    copy_file=False,
    rename_file: str | None = None,
    **kwargs,
) -> AssetFilePath:
    """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.

    Given the name of an asset table, and a file name, register the file for upload and return a path to that
    file in the upload directory. In addition to the filename, additional asset metadata and file asset types may
    be specified.

    This routine has three modes, depending on if file_name refers to an existing file. If it doesn't, a path
    to a new file with the specified name is returned. The caller can then open that file for writing.

    If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
    returned path contains a symbolic link to that file. If the copy_file argument is True, then the contents of
    file_name are copied into the target directory.

    Args:
        asset_name: Name of the asset table the file belongs to.
        file_name: Name of file to be uploaded.
        asset_types: Type of asset to be uploaded. Must be terms in the Asset_Type controlled vocabulary.
            Defaults to the name of the asset.
        copy_file: Whether to copy the file rather than creating a symbolic link.
        rename_file: If provided, the file will be renamed to this name if the file already exists.
        **kwargs: Any additional metadata values that may be part of the asset table.

    Returns:
        AssetFilePath describing the registered asset file.

    Raises:
        DerivaMLException: If asset_name is not an asset table.
    """
    if not self._model.is_asset(asset_name):
        # Bug fix: the exception was previously constructed but never raised.
        raise DerivaMLException(f"Table {asset_name} is not an asset")

    asset_table = self._model.name_to_table(asset_name)

    # Default the asset types to the asset name; accept a single string or a list.
    asset_types = asset_types or kwargs.get("Asset_Type", None) or asset_name
    asset_types = [asset_types] if isinstance(asset_types, str) else asset_types
    for t in asset_types:
        # Raises if the term is not in the Asset_Type vocabulary.
        self._ml_object.lookup_term(MLVocab.asset_type, t)

    # Determine if we will need to rename an existing file as the asset.
    file_name = Path(file_name)
    if file_name.name == "_implementations.log":
        # There is a funny bug with S3 hatrac if we have the leading _ in the filename.
        file_name = file_name.with_name("-implementations.log")

    target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
    # Calls the module-level asset_file_path helper (shadowed by this method
    # name only inside the class body, not at call time).
    asset_path = asset_file_path(
        prefix=self._working_dir,
        exec_rid=self.execution_rid,
        asset_table=asset_table,  # reuse the table looked up above
        file_name=target_name.name,
        metadata=kwargs,
    )

    if file_name.exists():
        if copy_file:
            asset_path.write_bytes(file_name.read_bytes())
        else:
            try:
                asset_path.symlink_to(file_name)
            except (OSError, PermissionError):
                # Fallback to copy if symlink fails (common on Windows)
                asset_path.write_bytes(file_name.read_bytes())

    # Persist the asset types into a file
    with Path(asset_type_path(self._working_dir, self.execution_rid, asset_table)).open("a") as asset_type_file:
        asset_type_file.write(json.dumps({target_name.name: asset_types}) + "\n")

    return AssetFilePath(
        asset_path=asset_path,
        asset_name=asset_name,
        file_name=target_name.name,
        asset_metadata=kwargs,
        asset_types=asset_types,
    )
|
|
957
|
+
|
|
958
|
+
def table_path(self, table: str) -> Path:
    """Return a local file path to a CSV to add values to a table on upload.

    Args:
        table: Name of table to be uploaded.

    Returns:
        Pathlib path to the file in which to place table values.

    Raises:
        DerivaMLException: If the table is not in the domain schema.
    """
    if table not in self._model.schemas[self._ml_object.domain_schema].tables:
        # f-string for consistency with the rest of the file; message text unchanged.
        raise DerivaMLException(f"Table '{table}' not found in domain schema")

    return table_path(self._working_dir, schema=self._ml_object.domain_schema, table=table)
|
|
971
|
+
|
|
972
|
+
def execute(self) -> Execution:
    """Start this execution and return the instance.

    Usable either directly or as the entry point of a context manager.
    """
    self.execution_start()
    return self
|
|
976
|
+
|
|
977
|
+
@validate_call
def add_features(self, features: Iterable[FeatureRecord]) -> None:
    """Stage feature records for upload, associated with this execution.

    Features represent measurable properties or characteristics of records.
    Incoming records are grouped by their concrete class so that each batch
    written out is homogeneous.

    NOTE: The catalog is not updated until upload_execution_outputs() is called.

    Args:
        features: Feature records to add, each containing a value and metadata.

    Raises:
        DerivaMLException: If feature addition fails or features are invalid.

    Example:
        >>> feature = FeatureRecord(value="high", confidence=0.95)
        >>> execution.add_features([feature])
    """
    # Group records by concrete class so each batch is homogeneous.
    batches: defaultdict = defaultdict(list)
    for record in features:
        batches[type(record)].append(record)
    for batch in batches.values():
        self._add_features(batch)
|
|
1003
|
+
|
|
1004
|
+
def _add_features(self, features: list[FeatureRecord]) -> None:
    """Append a homogeneous batch of feature records to the execution's JSON-lines staging file.

    Args:
        features: Non-empty list of feature records, all for the same feature.
    """
    # All records in the batch share one feature definition; use the first
    # record to locate the staging file.
    feature = features[0].feature
    json_path = feature_value_path(
        self._working_dir,
        schema=self._ml_object.domain_schema,
        target_table=feature.target_table.name,
        feature_name=feature.feature_name,
        exec_rid=self.execution_rid,
    )
    with Path(json_path).open("a", encoding="utf-8") as file:
        # Loop variable renamed from `feature` so it no longer shadows the
        # feature definition captured above.
        for record in features:
            record.Execution = self.execution_rid
            file.write(json.dumps(record.model_dump(mode="json")) + "\n")
|
|
1019
|
+
|
|
1020
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def create_dataset(
    self,
    dataset_types: str | list[str],
    description: str,
    version: DatasetVersion | None = None,
) -> RID:
    """Create a new dataset attributed to this execution.

    Delegates to DerivaML.create_dataset, passing this execution's RID so the
    new dataset is linked to the execution.

    Args:
        dataset_types: One or more dataset type terms for the new dataset.
        description: Markdown description of the dataset being created.
        version: Version to assign to the dataset. Defaults to 0.1.0

    Returns:
        RID of the newly created dataset.
    """
    return self._ml_object.create_dataset(dataset_types, description, self.execution_rid, version=version)
|
|
1038
|
+
|
|
1039
|
+
def add_dataset_members(
    self,
    dataset_rid: RID,
    members: list[RID] | dict[str, list[RID]],
    validate: bool = True,
    description: str = "",
) -> None:
    """Extend an existing dataset with new member records.

    Delegates to DerivaML.add_dataset_members with this execution's RID so
    the change is attributed to the execution. Adding members increments the
    dataset's minor version, and the description, if provided, is attached to
    the new version. Member RIDs may come from different tables, but each
    table must be configured as a dataset element type
    (see DerivaML.add_dataset_element_type).

    Args:
        dataset_rid: RID of the dataset to extend.
        members: RIDs of the records to add to the dataset.
        validate: Check that the RIDs are not already members of the dataset.
        description: Markdown description of the updated dataset version.
    """
    return self._ml_object.add_dataset_members(
        dataset_rid=dataset_rid,
        members=members,
        validate=validate,
        description=description,
        execution_rid=self.execution_rid,
    )
|
|
1068
|
+
|
|
1069
|
+
def increment_dataset_version(
    self, dataset_rid: RID, component: VersionPart, description: str = ""
) -> DatasetVersion:
    """Bump one component of a dataset's semantic version.

    Delegates to DerivaML.increment_dataset_version, attributing the version
    change to this execution.

    Args:
        dataset_rid: RID of the dataset whose version is to be incremented.
        component: Which part of the version to bump (Major, Minor, or Patch).
        description: Description of the version update of the dataset.

    Returns:
        The new semantic version of the dataset.

    Raises:
        DerivaMLException: If the provided RID does not identify a dataset.
    """
    return self._ml_object.increment_dataset_version(
        dataset_rid=dataset_rid,
        component=component,
        description=description,
        execution_rid=self.execution_rid,
    )
|
|
1093
|
+
|
|
1094
|
+
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def add_files(
    self,
    files: Iterable[FileSpec],
    dataset_types: str | list[str] | None = None,
    description: str = "",
) -> RID:
    """Register files and their metadata in the catalog under this execution.

    Each FileSpec carries the file's MD5 checksum, length, and URL. The files
    are added via DerivaML.add_files with this execution's RID; the resulting
    dataset mirrors the original directory structure of the files.

    Args:
        files: File specifications containing MD5 checksum, length, and URL.
        dataset_types: One or more dataset type terms from File_Type vocabulary.
        description: Description of the files.

    Returns:
        RID: Dataset RID that identifies the newly added files.

    Raises:
        DerivaMLInvalidTerm: If the dataset types are invalid.

    Examples:
        Add files with a single dataset type:
        >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
        >>> rid = exe.add_files(files, dataset_types="text")

        Add files with multiple dataset types:
        >>> rid = exe.add_files(
        ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
        ...     dataset_types=["image", "png"],
        ... )
    """
    return self._ml_object.add_files(
        files=files,
        dataset_types=dataset_types,
        execution_rid=self.execution_rid,
        description=description,
    )
|
|
1135
|
+
|
|
1136
|
+
def __str__(self):
    """Human-readable summary of the execution's key attributes."""
    return "\n".join(
        [
            f"caching_dir: {self._cache_dir}",
            f"_working_dir: {self._working_dir}",
            f"execution_rid: {self.execution_rid}",
            f"workflow_rid: {self.workflow_rid}",
            f"asset_paths: {self.asset_paths}",
            f"configuration: {self.configuration}",
        ]
    )
|
|
1146
|
+
|
|
1147
|
+
def __enter__(self):
    """Enter the runtime context: start the execution and return the instance.

    Returns:
        self: The instance itself.
    """
    self.execution_start()
    return self
|
|
1157
|
+
|
|
1158
|
+
def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> bool:
    """Exit the runtime context, recording success or failure of the execution.

    On a failure, the exception is logged and recorded in the execution
    status, and False is returned so the exception propagates.

    Args:
        exc_type: Exception type.
        exc_value: Exception value.
        exc_tb: Exception traceback.

    Returns:
        bool: True if execution completed successfully, False otherwise.
    """
    # Guard clause: handle the failure path first.
    if exc_type:
        self.update_status(
            Status.failed,
            f"Exception type: {exc_type}, Exception value: {exc_value}",
        )
        logging.error(f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}")
        return False
    self.update_status(Status.running, "Successfully run Ml.")
    self.execution_stop()
    return True
|