deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/execution/execution.py
CHANGED
@@ -18,7 +18,14 @@ Typical usage example:
     >>> with ml.create_execution(config) as execution:
     ...     execution.download_dataset_bag(dataset_spec)
    ...     # Run analysis
-    ...     execution.
+    ...     path = execution.asset_file_path("Model", "model.pt")
+    ...     # Write model to path...
+    ...
+    >>> # IMPORTANT: Upload AFTER the context manager exits
+    >>> execution.upload_execution_outputs()
+
+The context manager handles start/stop timing automatically. The upload_execution_outputs()
+call must happen AFTER exiting the context manager to ensure proper status tracking.
 """
 
 from __future__ import annotations
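
For orientation, here is the pattern the rewritten module docstring describes, as one runnable-style sketch. It assumes a configured `DerivaML` instance `ml`, an `ExecutionConfiguration` `config`, and a `DatasetSpec` `dataset_spec` (names taken from the doctest above); the point this release stresses is that the upload happens after the `with` block exits:

```python
# Sketch only; ml, config, and dataset_spec are assumed to be set up already.
with ml.create_execution(config) as execution:
    bag = execution.download_dataset_bag(dataset_spec)      # fetch inputs
    path = execution.asset_file_path("Model", "model.pt")   # register an output file
    # ... run the analysis and write the model to `path` ...

# Upload OUTSIDE the context manager so status tracking stays correct.
execution.upload_execution_outputs()
```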
@@ -28,10 +35,11 @@ import logging
 import os
 import shutil
 import sys
+import time
 from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterable, List
+from typing import Any, Callable, Iterable, List
 
 from deriva.core import format_exception
 from deriva.core.hatrac_store import HatracStore
@@ -47,9 +55,12 @@ from deriva_ml.core.definitions import (
     MLAsset,
     MLVocab,
     Status,
+    UploadProgress,
 )
 from deriva_ml.core.exceptions import DerivaMLException
-from deriva_ml.
+from deriva_ml.asset.aux_classes import AssetFilePath
+from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion
+from deriva_ml.dataset.dataset import Dataset
 from deriva_ml.dataset.dataset_bag import DatasetBag
 from deriva_ml.dataset.upload import (
     asset_file_path,
@@ -65,8 +76,10 @@ from deriva_ml.dataset.upload import (
 )
 from deriva_ml.execution.environment import get_execution_environment
 from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution_record import ExecutionRecord
 from deriva_ml.execution.workflow import Workflow
 from deriva_ml.feature import FeatureRecord
+from deriva_ml.model.deriva_ml_database import DerivaMLDatabase
 
 # Keep pycharm from complaining about undefined references in docstrings.
 execution: Execution
@@ -90,92 +103,6 @@ except ImportError:
         return s
 
 
-# Platform-specific base class
-if sys.version_info >= (3, 12):
-
-    class AssetFilePath(Path):
-        """Extended Path class for managing asset files.
-
-        Represents a file path with additional metadata about its role as an asset in the catalog.
-        This class extends the standard Path class to include information about the asset's
-        catalog representation and type.
-
-        Attributes:
-            asset_name (str): Name of the asset in the catalog (e.g., asset table name).
-            file_name (str): Name of the local file containing the asset.
-            asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
-            asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
-            asset_rid (RID | None): Resource Identifier if uploaded to an asset table.
-
-        Example:
-            >>> path = AssetFilePath(
-            ...     "/path/to/file.txt",
-            ...     asset_name="analysis_output",
-            ...     file_name="results.txt",
-            ...     asset_metadata={"version": "1.0"},
-            ...     asset_types=["text", "results"]
-            ... )
-        """
-
-        def __init__(
-            self,
-            asset_path: str | Path,
-            asset_name: str,
-            file_name: str,
-            asset_metadata: dict[str, Any],
-            asset_types: list[str] | str,
-            asset_rid: RID | None = None,
-        ):
-            """Initializes an AssetFilePath instance.
-
-            Args:
-                asset_path: Local path to the asset file.
-                asset_name: Name of the asset in the catalog.
-                file_name: Name of the local file.
-                asset_metadata: Additional metadata columns.
-                asset_types: One or more asset type terms.
-                asset_rid: Optional Resource Identifier if already in catalog.
-            """
-            super().__init__(asset_path)
-            self.asset_name = asset_name
-            self.file_name = file_name
-            self.asset_metadata = asset_metadata
-            self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
-            self.asset_rid = asset_rid
-else:
-
-    class AssetFilePath(type(Path())):
-        """
-        Create a new Path object that has additional information related to the use of this path as an asset.
-
-        Attrubytes:
-            asset_path: Local path to the location of the asset.
-            asset_name: The name of the asset in the catalog (e.g., the asset table name).
-            file_name: Name of the local file that contains the contents of the asset.
-            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
-            asset_types: A list of terms from the Asset_Type controlled vocabulary.
-            asset_rid: The RID of the asset if it has been uploaded into an asset table
-        """
-
-        def __new__(
-            cls,
-            asset_path: str | Path,
-            asset_name: str,
-            file_name: str,
-            asset_metadata: dict[str, Any],
-            asset_types: list[str] | str,
-            asset_rid: RID | None = None,
-        ):
-            # Only pass the path to the base Path class
-            obj = super().__new__(cls, asset_path)
-            obj.asset_name = asset_name
-            obj.file_name = file_name
-            obj.asset_metadata = asset_metadata
-            obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
-            obj.asset_rid = asset_rid
-            return obj
-
-
 class Execution:
     """Manages the lifecycle and context of a DerivaML execution.
 
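
The block removed above (the class now lives in `deriva_ml/asset/aux_classes.py`) used a version-gated subclassing trick worth spelling out: before Python 3.12, `pathlib.Path.__new__` dispatched to a concrete flavour class, so attribute-carrying subclasses had to derive from `type(Path())` and do their work in `__new__`; from 3.12 on, `Path` subclasses normally and a plain `__init__` suffices. A minimal generic sketch of the same pattern (illustrative names, not the library's API):

```python
import sys
from pathlib import Path

if sys.version_info >= (3, 12):
    class TaggedPath(Path):  # 3.12+: Path is directly subclassable
        def __init__(self, *args, tag: str = ""):
            super().__init__(*args)
            self.tag = tag
else:
    class TaggedPath(type(Path())):  # pre-3.12: inherit the concrete flavour
        def __new__(cls, *args, tag: str = ""):
            obj = super().__new__(cls, *args)  # only path parts go to Path
            obj.tag = tag
            return obj

p = TaggedPath("/tmp/model.pt", tag="model")
print(p.name, p.tag)  # model.pt model
```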
@@ -201,14 +128,21 @@ class Execution:
         stop_time (datetime | None): When execution completed.
 
     Example:
-
-
-
-
-
-
-
-
+        The context manager handles start/stop timing. Upload must be called AFTER
+        the context manager exits::
+
+        >>> config = ExecutionConfiguration(
+        ...     workflow="analysis",
+        ...     description="Process samples",
+        ... )
+        >>> with ml.create_execution(config) as execution:
+        ...     bag = execution.download_dataset_bag(dataset_spec)
+        ...     # Run analysis using bag.path
+        ...     output_path = execution.asset_file_path("Model", "model.pt")
+        ...     # Write results to output_path
+        ...
+        >>> # IMPORTANT: Call upload AFTER exiting the context manager
+        >>> execution.upload_execution_outputs()
     """
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -216,7 +150,7 @@ class Execution:
         self,
         configuration: ExecutionConfiguration,
         ml_object: DerivaML,
-        workflow: Workflow |
+        workflow: Workflow | None = None,
         reload: RID | None = None,
         dry_run: bool = False,
     ):
@@ -228,13 +162,32 @@ class Execution:
         Args:
             configuration: Settings and parameters for the execution.
             ml_object: DerivaML instance managing the execution.
-            workflow: Optional
-
+            workflow: Optional Workflow object. If not specified, the workflow is taken from
+                the ExecutionConfiguration object. Must be a Workflow object, not a RID.
             reload: Optional RID of existing execution to reload.
             dry_run: If True, don't create catalog records or upload results.
 
         Raises:
-            DerivaMLException: If initialization fails
+            DerivaMLException: If initialization fails, configuration is invalid,
+                or workflow is not a Workflow object.
+
+        Example:
+            Create an execution with a workflow::
+
+            >>> workflow = ml.lookup_workflow("2-ABC1")
+            >>> config = ExecutionConfiguration(
+            ...     workflow=workflow,
+            ...     description="Process data"
+            ... )
+            >>> execution = Execution(config, ml)
+
+            Or pass workflow separately::
+
+            >>> workflow = ml.lookup_workflow_by_url(
+            ...     "https://github.com/org/repo/blob/abc123/analysis.py"
+            ... )
+            >>> config = ExecutionConfiguration(description="Run analysis")
+            >>> execution = Execution(config, ml, workflow=workflow)
         """
 
         self.asset_paths: dict[str, list[AssetFilePath]] = {}
@@ -244,9 +197,10 @@ class Execution:
         self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
-        self.
+        self._status = Status.created
         self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
         self.configuration.argv = sys.argv
+        self._execution_record: ExecutionRecord | None = None  # Lazily created after RID is assigned
 
         self.dataset_rids: List[RID] = []
         self.datasets: list[DatasetBag] = []
@@ -255,18 +209,24 @@ class Execution:
         self._cache_dir = self._ml_object.cache_dir
         self._dry_run = dry_run
 
-        # Make sure we have a
+        # Make sure we have a valid Workflow object.
         if workflow:
             self.configuration.workflow = workflow
-
-
-
-
+
+        if self.configuration.workflow is None:
+            raise DerivaMLException("Workflow must be specified either in configuration or as a parameter")
+
+        if not isinstance(self.configuration.workflow, Workflow):
+            raise DerivaMLException(
+                f"Workflow must be a Workflow object, not {type(self.configuration.workflow).__name__}. "
+                "Use ml.lookup_workflow(rid) or ml.lookup_workflow_by_url(url) to get a Workflow object."
             )
-
-
-
-
+
+        # Validate workflow type and register in catalog
+        self._ml_object.lookup_term(MLVocab.workflow_type, self.configuration.workflow.workflow_type)
+        self.workflow_rid = (
+            self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
+        )
 
         # Validate the datasets and assets to be valid.
         for d in self.configuration.datasets:
@@ -277,7 +237,7 @@ class Execution:
             if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
                 raise DerivaMLException("Asset specified in execution configuration is not a asset table")
 
-        schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+        schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
             if self.execution_rid == DRY_RUN_RID:
@@ -309,6 +269,18 @@ class Execution:
 
         # Create a directory for execution rid so we can recover the state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
+
+        # Create the ExecutionRecord to handle catalog state operations
+        if not self._dry_run:
+            self._execution_record = ExecutionRecord(
+                execution_rid=self.execution_rid,
+                workflow=self.configuration.workflow,
+                status=Status.created,
+                description=self.configuration.description,
+                _ml_instance=self._ml_object,
+                _logger=self._logger,
+            )
+
         self._initialize_execution(reload)
 
     def _save_runtime_environment(self):
@@ -321,31 +293,33 @@ class Execution:
             json.dump(get_execution_environment(), fp)
 
     def _upload_hydra_config_assets(self):
-        """Upload hydra assets to the catalog."""
+        """Upload hydra assets to the catalog with Hydra_Config type."""
         hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
         if hydra_runtime_output_dir:
             timestamp = hydra_runtime_output_dir.parts[-1]
             for hydra_asset in hydra_runtime_output_dir.rglob("*"):
                 if hydra_asset.is_dir():
                     continue
-
+                # Register file for upload (side effect); result intentionally unused
+                # Use Hydra_Config type for Hydra YAML configuration files
+                self.asset_file_path(
                     asset_name=MLAsset.execution_metadata,
                     file_name=hydra_runtime_output_dir / hydra_asset,
                     rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
-                    asset_types=ExecMetadataType.
+                    asset_types=ExecMetadataType.hydra_config.value,
                 )
 
     def _initialize_execution(self, reload: RID | None = None) -> None:
-        """Initialize the execution
-        Set up a working directory and download all the assets and data.
+        """Initialize the execution environment.
 
-
+        Sets up the working directory, downloads required datasets and assets,
+        and saves initial configuration metadata.
 
         Args:
-            reload: RID of previously initialized execution.
-
-        Returns:
+            reload: Optional RID of a previously initialized execution to reload.
 
+        Raises:
+            DerivaMLException: If initialization fails.
         """
         # Materialize bdbag
         for dataset in self.configuration.datasets:
@@ -354,7 +328,7 @@ class Execution:
             self.dataset_rids.append(dataset.rid)
 
         # Update execution info
-        schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+        schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
         if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
@@ -379,16 +353,21 @@ class Execution:
 
         # Save configuration details for later upload
         if not reload:
+            # Save DerivaML configuration with Deriva_Config type
            cfile = self.asset_file_path(
                 asset_name=MLAsset.execution_metadata,
                 file_name="configuration.json",
-                asset_types=ExecMetadataType.
+                asset_types=ExecMetadataType.deriva_config.value,
             )
 
             with Path(cfile).open("w", encoding="utf-8") as config_file:
                 json.dump(self.configuration.model_dump(mode="json"), config_file)
-
-            if
+            # Only try to copy uv.lock if git_root is available (local workflow)
+            if self.configuration.workflow.git_root:
+                lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
+            else:
+                lock_file = None
+            if lock_file and lock_file.exists():
                 _ = self.asset_file_path(
                     asset_name=MLAsset.execution_metadata,
                     file_name=lock_file,
@@ -405,6 +384,37 @@ class Execution:
         self.start_time = datetime.now()
         self.update_status(Status.pending, "Initialize status finished.")
 
+    @property
+    def status(self) -> Status:
+        """Get the current execution status.
+
+        Returns:
+            Status: The current status (Created, Running, Completed, Failed, etc.).
+        """
+        if self._execution_record is not None:
+            return self._execution_record.status
+        return self._status
+
+    @status.setter
+    def status(self, value: Status) -> None:
+        """Set the execution status.
+
+        Args:
+            value: The new status value.
+        """
+        self._status = value
+        if self._execution_record is not None:
+            self._execution_record._status = value
+
+    @property
+    def execution_record(self) -> ExecutionRecord | None:
+        """Get the ExecutionRecord for catalog operations.
+
+        Returns:
+            ExecutionRecord if not in dry_run mode, None otherwise.
+        """
+        return self._execution_record
+
     @property
     def working_dir(self) -> Path:
         """Return the working directory for the execution."""
@@ -412,39 +422,78 @@ class Execution:
 
     @property
     def _execution_root(self) -> Path:
+        """Get the root directory for this execution's files.
+
+        Returns:
+            Path to the execution-specific directory.
         """
+        return execution_root(self._working_dir, self.execution_rid)
 
-
+    @property
+    def _feature_root(self) -> Path:
+        """Get the root directory for feature files.
 
        Returns:
-
+            Path to the feature directory within the execution.
+        """
+        return feature_root(self._working_dir, self.execution_rid)
+
+    @property
+    def _asset_root(self) -> Path:
+        """Get the root directory for asset files.
 
+        Returns:
+            Path to the asset directory within the execution.
         """
-        return
+        return asset_root(self._working_dir, self.execution_rid)
 
     @property
-    def
-        """
-        :return:
+    def database_catalog(self) -> DerivaMLDatabase | None:
+        """Get a catalog-like interface for downloaded datasets.
 
-
+        Returns a DerivaMLDatabase that implements the DerivaMLCatalog
+        protocol, allowing the same code to work with both live catalogs
+        and downloaded bags.
+
+        This is useful for writing code that can operate on either a live
+        catalog (via DerivaML) or on downloaded bags (via DerivaMLDatabase).
 
         Returns:
+            DerivaMLDatabase wrapping the primary downloaded dataset's model,
+            or None if no datasets have been downloaded.
 
+        Example:
+            >>> with ml.create_execution(config) as exe:
+            ...     if exe.database_catalog:
+            ...         db = exe.database_catalog
+            ...         # Use same interface as DerivaML
+            ...         dataset = db.lookup_dataset("4HM")
+            ...         term = db.lookup_term("Diagnosis", "cancer")
+            ...     else:
+            ...         # No datasets downloaded, use live catalog
+            ...         pass
         """
-
+        if not self.datasets:
+            return None
+        # Use the first dataset's model as the primary
+        return DerivaMLDatabase(self.datasets[0].model)
 
     @property
-    def
-        """
-        :return:
+    def catalog(self) -> "DerivaML":
+        """Get the live catalog (DerivaML) instance for this execution.
 
-
+        This provides access to the live catalog for operations that require
+        catalog connectivity, such as looking up datasets or other read operations.
 
         Returns:
+            DerivaML: The live catalog instance.
 
+        Example:
+            >>> with ml.create_execution(config) as exe:
+            ...     # Use live catalog for lookups
+            ...     existing_dataset = exe.catalog.lookup_dataset("1-ABC")
         """
-        return
+        return self._ml_object
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
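
The `database_catalog` / `catalog` pair added above is meant to let the same analysis code run against either a downloaded bag or the live catalog. A small sketch under that assumption, using only the lookup calls the docstrings show (`lookup_dataset`, `lookup_term`):

```python
# Sketch: prefer the bag-backed catalog when datasets were downloaded,
# otherwise fall back to the live DerivaML instance.
def resolve_inputs(exe, dataset_rid: str):
    db = exe.database_catalog or exe.catalog  # None falls through to live catalog
    dataset = db.lookup_dataset(dataset_rid)
    term = db.lookup_term("Diagnosis", "cancer")
    return dataset, term
```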
@@ -471,7 +520,7 @@ class Execution:
             >>> bag = execution.download_dataset_bag(spec)
             >>> print(f"Downloaded to {bag.path}")
         """
-        return self._ml_object.download_dataset_bag(dataset
+        return self._ml_object.download_dataset_bag(dataset)
 
     @validate_call
     def update_status(self, status: Status, msg: str) -> None:
@@ -490,21 +539,26 @@ class Execution:
         Example:
             >>> execution.update_status(Status.running, "Processing sample 1 of 10")
         """
-        self.
+        self._status = status
         self._logger.info(msg)
 
         if self._dry_run:
             return
 
-
-
-
-
-
-
-
-
-
+        # Delegate to ExecutionRecord for catalog updates
+        if self._execution_record is not None:
+            self._execution_record.update_status(status, msg)
+        else:
+            # Fallback for cases where ExecutionRecord isn't available
+            self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
+                [
+                    {
+                        "RID": self.execution_rid,
+                        "Status": status.value,
+                        "Status_Detail": msg,
+                    }
+                ]
+            )
 
     def execution_start(self) -> None:
         """Marks the execution as started.
@@ -545,17 +599,23 @@ class Execution:
 
         self.update_status(Status.completed, "Algorithm execution ended.")
         if not self._dry_run:
-            self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
+            self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
                 [{"RID": self.execution_rid, "Duration": duration}]
             )
 
-    def _upload_execution_dirs(
+    def _upload_execution_dirs(
+        self, progress_callback: Callable[[UploadProgress], None] | None = None
+    ) -> dict[str, list[AssetFilePath]]:
         """Upload execution assets at _working_dir/Execution_asset.
 
         This routine uploads the contents of the
         Execution_Asset directory and then updates the execution_asset table in the ML schema to have references
         to these newly uploaded files.
 
+        Args:
+            progress_callback: Optional callback function to receive upload progress updates.
+                Called with UploadProgress objects containing file information and progress.
+
         Returns:
             dict: Results of the upload operation.
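
Since `_upload_execution_dirs` now threads `progress_callback` through to `upload_directory`, callers can surface progress however they like. A hedged sketch, assuming `UploadProgress` exposes the fields the docstrings in this diff name (`file_name`, `percent_complete`):

```python
# Sketch: minimal progress reporter; field names follow this diff's docstrings.
def log_progress(progress) -> None:
    print(f"[upload] {progress.file_name}: {progress.percent_complete:.1f}%")

outputs = execution.upload_execution_outputs(progress_callback=log_progress)
```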
@@ -565,11 +625,11 @@ class Execution:
 
         try:
             self.update_status(Status.running, "Uploading execution files...")
-            results = upload_directory(self._model, self._asset_root)
-        except RuntimeError as e:
+            results = upload_directory(self._model, self._asset_root, progress_callback=progress_callback)
+        except (RuntimeError, DerivaMLException) as e:
             error = format_exception(e)
             self.update_status(Status.failed, error)
-            raise DerivaMLException(f"
+            raise DerivaMLException(f"Failed to upload execution_assets: {error}")
 
         asset_map = {}
         for path, status in results.items():
@@ -578,7 +638,7 @@ class Execution:
             asset_map.setdefault(asset_table, []).append(
                 AssetFilePath(
                     asset_path=path,
-
+                    asset_table=asset_table,
                     file_name=file_name,
                     asset_metadata={
                         k: v
@@ -629,7 +689,7 @@ class Execution:
            hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
 
        asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
-        type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
+        type_path = self._ml_object.pathBuilder().schemas[asset_type_table.schema.name].tables[asset_type_table.name]
        asset_types = [
            asset_type[MLVocab.asset_type.value]
            for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
@@ -642,7 +702,7 @@ class Execution:
            asset_rid=asset_rid,
            asset_path=asset_filename,
            asset_metadata=asset_metadata,
-
+            asset_table=asset_table.name,
            asset_types=asset_types,
        )
 
@@ -690,15 +750,26 @@ class Execution:
         results = upload_directory(self._model, assets_dir)
         return {path_to_asset(p): r for p, r in results.items()}
 
-    def upload_execution_outputs(
+    def upload_execution_outputs(
+        self, clean_folder: bool | None = None, progress_callback: Callable[[UploadProgress], None] | None = None
+    ) -> dict[str, list[AssetFilePath]]:
         """Uploads all outputs from the execution to the catalog.
 
         Scans the execution's output directories for assets, features, and other results,
         then uploads them to the catalog. Can optionally clean up the output folders
         after successful upload.
 
+        IMPORTANT: This method must be called AFTER exiting the context manager, not inside it.
+        The context manager handles execution timing (start/stop), while this method handles
+        the separate upload step.
+
         Args:
-            clean_folder: Whether to delete output folders after upload.
+            clean_folder: Whether to delete output folders after upload. If None (default),
+                uses the DerivaML instance's clean_execution_dir setting. Pass True/False
+                to override for this specific execution.
+            progress_callback: Optional callback function to receive upload progress updates.
+                Called with UploadProgress objects containing file name, bytes uploaded,
+                total bytes, percent complete, phase, and status message.
 
         Returns:
             dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.
@@ -707,14 +778,28 @@ class Execution:
             DerivaMLException: If upload fails or outputs are invalid.
 
         Example:
-            >>>
-
-            ...
+            >>> with ml.create_execution(config) as execution:
+            ...     # Do ML work, register output files with asset_file_path()
+            ...     path = execution.asset_file_path("Model", "model.pt")
+            ...     # Write to path...
+            ...
+            >>> # Upload AFTER the context manager exits
+            >>> def my_callback(progress):
+            ...     print(f"Uploading {progress.file_name}: {progress.percent_complete:.1f}%")
+            >>> outputs = execution.upload_execution_outputs(progress_callback=my_callback)
+            >>>
+            >>> # Override cleanup setting for this execution
+            >>> outputs = execution.upload_execution_outputs(clean_folder=False)  # Keep files
         """
         if self._dry_run:
             return {}
+
+        # Use DerivaML instance setting if not explicitly provided
+        if clean_folder is None:
+            clean_folder = getattr(self._ml_object, 'clean_execution_dir', True)
+
         try:
-            self.uploaded_assets = self._upload_execution_dirs()
+            self.uploaded_assets = self._upload_execution_dirs(progress_callback=progress_callback)
             self.update_status(Status.completed, "Successfully end the execution.")
             if clean_folder:
                 self._clean_folder_contents(self._execution_root)
@@ -724,14 +809,17 @@ class Execution:
             self.update_status(Status.failed, error)
             raise e
 
-    def _clean_folder_contents(self, folder_path: Path):
-        """Clean up folder contents
+    def _clean_folder_contents(self, folder_path: Path, remove_folder: bool = True):
+        """Clean up folder contents and optionally the folder itself.
+
+        Removes all files and subdirectories within the specified folder.
+        Uses retry logic for Windows compatibility where files may be temporarily locked.
 
         Args:
-            folder_path: Path to the folder to clean
+            folder_path: Path to the folder to clean.
+            remove_folder: If True (default), also remove the folder itself after
+                cleaning its contents. If False, only remove contents.
         """
-        import time
-
         MAX_RETRIES = 3
         RETRY_DELAY = 1  # seconds
 
@@ -745,20 +833,26 @@ class Execution:
                     return True
                 except (OSError, PermissionError) as e:
                     if attempt == MAX_RETRIES - 1:
-
+                        logging.warning(f"Failed to remove {path}: {e}")
                         return False
                    time.sleep(RETRY_DELAY)
            return False
 
        try:
+            # First remove all contents
            with os.scandir(folder_path) as entries:
                for entry in entries:
                    if entry.is_dir() and not entry.is_symlink():
                        remove_with_retry(Path(entry.path), is_dir=True)
                    else:
                        remove_with_retry(Path(entry.path))
+
+            # Then remove the folder itself if requested
+            if remove_folder:
+                remove_with_retry(folder_path, is_dir=True)
+
        except OSError as e:
-
+            logging.warning(f"Failed to clean folder {folder_path}: {e}")
 
     def _update_feature_table(
         self,
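
The retry loop above exists because Windows can briefly keep handles open on freshly written files, making immediate deletion fail. The same pattern as a standalone sketch (a generic helper, not the module's exact code):

```python
import logging
import shutil
import time
from pathlib import Path

def remove_with_retry(path: Path, is_dir: bool = False,
                      retries: int = 3, delay: float = 1.0) -> bool:
    """Delete a file or tree, retrying to ride out transient Windows locks."""
    for attempt in range(retries):
        try:
            shutil.rmtree(path) if is_dir else path.unlink()
            return True
        except (OSError, PermissionError) as exc:
            if attempt == retries - 1:
                logging.warning("Failed to remove %s: %s", path, exc)
                return False
            time.sleep(delay)  # give the lock holder a moment to let go
    return False
```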
@@ -767,14 +861,16 @@ class Execution:
         feature_file: str | Path,
         uploaded_files: dict[str, list[AssetFilePath]],
     ) -> None:
-        """
+        """Update the feature table with values from a JSONL file.
+
+        Reads feature values from a file and inserts them into the catalog,
+        replacing file paths with the RIDs of uploaded assets.
 
         Args:
-            target_table:
-            feature_name:
-            feature_file:
-            uploaded_files:
-                RID of that asset.
+            target_table: Name of the table the feature is defined on.
+            feature_name: Name of the feature to update.
+            feature_file: Path to JSONL file containing feature values.
+            uploaded_files: Map from asset table names to their uploaded AssetFilePath objects.
         """
 
         # Get the column names of all the Feature columns that should be the RID of an asset
@@ -804,7 +900,7 @@ class Execution:
         with Path(feature_file).open("r") as feature_values:
             entities = [json.loads(line.strip()) for line in feature_values]
             # Update the asset columns in the feature and add to the catalog.
-            self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
+            self._ml_object.domain_path().tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
 
     def _update_asset_execution_table(
         self,
@@ -824,7 +920,7 @@ class Execution:
             return
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
-        pb = self._ml_object.pathBuilder
+        pb = self._ml_object.pathBuilder()
         for asset_table, asset_list in uploaded_assets.items():
             asset_table_name = asset_table.split("/")[1]  # Peel off the schema from the asset table
             asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
@@ -924,6 +1020,11 @@ class Execution:
             # There is a funny bug with S3 hatrac if we have the leading _ in the filename.
             file_name = file_name.with_name("-implementations.log")
 
+        # Resolve relative paths to absolute paths to ensure exists() and symlink work correctly
+        # regardless of the current working directory
+        if not file_name.is_absolute():
+            file_name = file_name.resolve()
+
         target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
         asset_path = asset_file_path(
             prefix=self._working_dir,
@@ -949,7 +1050,7 @@ class Execution:
 
         return AssetFilePath(
             asset_path=asset_path,
-
+            asset_table=asset_name,
             file_name=target_name.name,
             asset_metadata=kwargs,
             asset_types=asset_types,
@@ -964,10 +1065,18 @@ class Execution:
         Returns:
             Pathlib path to the file in which to place table values.
         """
-
-
+        # Find which domain schema contains this table
+        table_schema = None
+        for domain_schema in self._ml_object.domain_schemas:
+            if domain_schema in self._model.schemas:
+                if table in self._model.schemas[domain_schema].tables:
+                    table_schema = domain_schema
+                    break
+
+        if table_schema is None:
+            raise DerivaMLException("Table '{}' not found in any domain schema".format(table))
 
-        return table_path(self._working_dir, schema=
+        return table_path(self._working_dir, schema=table_schema, table=table)
 
     def execute(self) -> Execution:
         """Initiate an execution with the provided configuration. Can be used in a context manager."""
@@ -1005,9 +1114,11 @@ class Execution:
             # Update feature records to include current execution_rid
             first_row = features[0]
             feature = first_row.feature
+            # Use the schema from the feature table
+            feature_schema = feature.feature_table.schema.name
             json_path = feature_value_path(
                 self._working_dir,
-                schema=
+                schema=feature_schema,
                 target_table=feature.target_table.name,
                 feature_name=feature.feature_name,
                 exec_rid=self.execution_rid,
@@ -1017,78 +1128,93 @@ class Execution:
                 feature.Execution = self.execution_rid
                 file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
 
-
-
-        self,
-        dataset_types: str | list[str],
-        description: str,
-        version: DatasetVersion | None = None,
-    ) -> RID:
-        """Create a new dataset with specified types.
-
-        Args:
-            dataset_types: param description:
-            description: Markdown description of the dataset being created.
-            version: Version to assign to the dataset. Defaults to 0.1.0
+    def list_input_datasets(self) -> list[Dataset]:
+        """List all datasets that were inputs to this execution.
 
         Returns:
-
+            List of Dataset objects that were used as inputs.
+
+        Example:
+            >>> for ds in execution.list_input_datasets():
+            ...     print(f"Input: {ds.dataset_rid} - {ds.description}")
         """
-
+        if self._execution_record is not None:
+            return self._execution_record.list_input_datasets()
 
-
-        self
-
-        members: list[RID] | dict[str, list[RID]],
-        validate: bool = True,
-        description: str = "",
-    ) -> None:
-        """Add additional elements to an existing dataset_table.
+        # Fallback for dry_run mode
+        pb = self._ml_object.pathBuilder()
+        dataset_exec = pb.schemas[self._ml_object.ml_schema].Dataset_Execution
 
-
-
+        records = list(
+            dataset_exec.filter(dataset_exec.Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        return [self._ml_object.lookup_dataset(r["Dataset"]) for r in records]
 
-
-
+    def list_assets(self, asset_role: str | None = None) -> list["Asset"]:
+        """List all assets that were inputs or outputs of this execution.
 
         Args:
-
-
-
-
-
+            asset_role: Optional filter: "Input" or "Output". If None, returns all.
+
+        Returns:
+            List of Asset objects associated with this execution.
+
+        Example:
+            >>> inputs = execution.list_assets(asset_role="Input")
+            >>> outputs = execution.list_assets(asset_role="Output")
         """
-
-
-
-
-
-
-        )
+        if self._execution_record is not None:
+            return self._execution_record.list_assets(asset_role=asset_role)
+
+        # Fallback for dry_run mode
+        from deriva_ml.asset.asset import Asset
+
+        pb = self._ml_object.pathBuilder()
+        asset_exec = pb.schemas[self._ml_object.ml_schema].Execution_Asset_Execution
+
+        query = asset_exec.filter(asset_exec.Execution == self.execution_rid)
+        if asset_role:
+            query = query.filter(asset_exec.Asset_Role == asset_role)
+
+        records = list(query.entities().fetch())
+
+        assets = []
+        for r in records:
+            try:
+                asset = self._ml_object.lookup_asset(r["Execution_Asset"])
+                assets.append(asset)
+            except Exception:
+                pass  # Skip assets that can't be looked up
+        return assets
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def create_dataset(
+        self,
+        dataset_types: str | list[str] | None = None,
+        version: DatasetVersion | str | None = None,
+        description: str = "",
+    ) -> Dataset:
+        """Create a new dataset with specified types.
 
-
-        self, dataset_rid: RID, component: VersionPart, description: str = ""
-    ) -> DatasetVersion:
-        """Increment the version of the specified dataset_table.
+        Creates a dataset associated with this execution for provenance tracking.
 
         Args:
-
-
-
-            component: Major, Minor, or Patch
-            description: Description of the version update of the dataset_table.
+            dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
+            description: Markdown description of the dataset being created.
+            version: Dataset version. Defaults to 0.1.0.
 
         Returns:
-
-
-        Raises:
-            DerivaMLException: if provided RID is not to a dataset_table.
+            The newly created Dataset.
         """
-        return
-
-            component=component,
-            description=description,
+        return Dataset.create_dataset(
+            ml_instance=self._ml_object,
             execution_rid=self.execution_rid,
+            dataset_types=dataset_types,
+            version=version,
+            description=description,
         )
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -1097,7 +1223,7 @@ class Execution:
         files: Iterable[FileSpec],
         dataset_types: str | list[str] | None = None,
         description: str = "",
-    ) ->
+    ) -> "Dataset":
         """Adds files to the catalog with their metadata.
 
         Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
@@ -1109,7 +1235,7 @@ class Execution:
             description: Description of the files.
 
         Returns:
-            RID: Dataset
+            RID: Dataset that identifies newly added files. Will be nested to mirror original directory structure
             of the files.
 
         Raises:
@@ -1128,11 +1254,186 @@ class Execution:
         """
         return self._ml_object.add_files(
             files=files,
-            dataset_types=dataset_types,
             execution_rid=self.execution_rid,
+            dataset_types=dataset_types,
             description=description,
         )
 
+    # =========================================================================
+    # Execution Nesting Methods
+    # =========================================================================
+
+    def add_nested_execution(
+        self,
+        nested_execution: "Execution | ExecutionRecord | RID",
+        sequence: int | None = None,
+    ) -> None:
+        """Add a nested (child) execution to this execution.
+
+        Creates a parent-child relationship between this execution and another.
+        This is useful for grouping related executions, such as parameter sweeps
+        or pipeline stages.
+
+        Args:
+            nested_execution: The child execution to add (Execution, ExecutionRecord, or RID).
+            sequence: Optional ordering index (0, 1, 2...). Use None for parallel executions.
+
+        Raises:
+            DerivaMLException: If the association cannot be created.
+
+        Example:
+            >>> parent_exec = ml.create_execution(parent_config)
+            >>> child_exec = ml.create_execution(child_config)
+            >>> parent_exec.add_nested_execution(child_exec, sequence=0)
+        """
+        if self._dry_run:
+            return
+
+        # Get the RID from the nested execution
+        if isinstance(nested_execution, Execution):
+            nested_rid = nested_execution.execution_rid
+        elif isinstance(nested_execution, ExecutionRecord):
+            nested_rid = nested_execution.execution_rid
+        else:
+            nested_rid = nested_execution
+
+        # Delegate to ExecutionRecord if available
+        if self._execution_record is not None:
+            self._execution_record.add_nested_execution(nested_rid, sequence=sequence)
+        else:
+            # Fallback for cases without execution record
+            pb = self._ml_object.pathBuilder()
+            execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+            record = {
+                "Execution": self.execution_rid,
+                "Nested_Execution": nested_rid,
+            }
+            if sequence is not None:
+                record["Sequence"] = sequence
+
+            execution_execution.insert([record])
+
+    def list_nested_executions(
+        self,
+        recurse: bool = False,
+        _visited: set[RID] | None = None,
+    ) -> list["ExecutionRecord"]:
+        """List all nested (child) executions of this execution.
+
+        Args:
+            recurse: If True, recursively return all descendant executions.
+            _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+        Returns:
+            List of nested ExecutionRecord objects, ordered by sequence if available.
+            To get full Execution objects with lifecycle management, use restore_execution().
+
+        Example:
+            >>> children = parent_exec.list_nested_executions()
+            >>> all_descendants = parent_exec.list_nested_executions(recurse=True)
+        """
+        if self._execution_record is not None:
+            return list(self._execution_record.list_nested_executions(recurse=recurse, _visited=_visited))
+
+        # Fallback for dry_run mode
+        if _visited is None:
+            _visited = set()
+
+        if self.execution_rid in _visited:
+            return []
+        _visited.add(self.execution_rid)
+
+        pb = self._ml_object.pathBuilder()
+        execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+        # Query for nested executions, ordered by sequence
+        nested = list(
+            execution_execution.filter(execution_execution.Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        # Sort by sequence (None values at the end)
+        nested.sort(key=lambda x: (x.get("Sequence") is None, x.get("Sequence")))
+
+        children = []
+        for record in nested:
+            child = self._ml_object.lookup_execution(record["Nested_Execution"])
+            children.append(child)
+            if recurse:
+                children.extend(child.list_nested_executions(recurse=True, _visited=_visited))
+
+        return children
+
+    def list_parent_executions(
+        self,
+        recurse: bool = False,
+        _visited: set[RID] | None = None,
+    ) -> list["ExecutionRecord"]:
+        """List all parent executions that contain this execution as a nested child.
+
+        Args:
+            recurse: If True, recursively return all ancestor executions.
+            _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+        Returns:
+            List of parent ExecutionRecord objects.
+            To get full Execution objects with lifecycle management, use restore_execution().
+
+        Example:
+            >>> parents = child_exec.list_parent_executions()
+            >>> all_ancestors = child_exec.list_parent_executions(recurse=True)
+        """
+        if self._execution_record is not None:
+            return list(self._execution_record.list_parent_executions(recurse=recurse, _visited=_visited))
+
+        # Fallback for dry_run mode
+        if _visited is None:
+            _visited = set()
+
+        if self.execution_rid in _visited:
+            return []
+        _visited.add(self.execution_rid)
+
+        pb = self._ml_object.pathBuilder()
+        execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+        parent_records = list(
+            execution_execution.filter(execution_execution.Nested_Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        parents = []
+        for record in parent_records:
+            parent = self._ml_object.lookup_execution(record["Execution"])
+            parents.append(parent)
+            if recurse:
+                parents.extend(parent.list_parent_executions(recurse=True, _visited=_visited))
+
+        return parents
+
+    def is_nested(self) -> bool:
+        """Check if this execution is nested within another execution.
+
+        Returns:
+            True if this execution has at least one parent execution.
+        """
+        if self._execution_record is not None:
+            return self._execution_record.is_nested()
+        return len(self.list_parent_executions()) > 0
+
+    def is_parent(self) -> bool:
+        """Check if this execution has nested child executions.
+
+        Returns:
+            True if this execution has at least one nested execution.
+        """
+        if self._execution_record is not None:
+            return self._execution_record.is_parent()
+        return len(self.list_nested_executions()) > 0
+
     def __str__(self):
         items = [
             f"caching_dir: {self._cache_dir}",
|