deriva-ml 1.17.9__py3-none-any.whl → 1.17.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. deriva_ml/__init__.py +43 -1
  2. deriva_ml/asset/__init__.py +17 -0
  3. deriva_ml/asset/asset.py +357 -0
  4. deriva_ml/asset/aux_classes.py +100 -0
  5. deriva_ml/bump_version.py +254 -11
  6. deriva_ml/catalog/__init__.py +21 -0
  7. deriva_ml/catalog/clone.py +1199 -0
  8. deriva_ml/catalog/localize.py +426 -0
  9. deriva_ml/core/__init__.py +29 -0
  10. deriva_ml/core/base.py +817 -1067
  11. deriva_ml/core/config.py +169 -21
  12. deriva_ml/core/constants.py +120 -19
  13. deriva_ml/core/definitions.py +123 -13
  14. deriva_ml/core/enums.py +47 -73
  15. deriva_ml/core/ermrest.py +226 -193
  16. deriva_ml/core/exceptions.py +297 -14
  17. deriva_ml/core/filespec.py +99 -28
  18. deriva_ml/core/logging_config.py +225 -0
  19. deriva_ml/core/mixins/__init__.py +42 -0
  20. deriva_ml/core/mixins/annotation.py +915 -0
  21. deriva_ml/core/mixins/asset.py +384 -0
  22. deriva_ml/core/mixins/dataset.py +237 -0
  23. deriva_ml/core/mixins/execution.py +408 -0
  24. deriva_ml/core/mixins/feature.py +365 -0
  25. deriva_ml/core/mixins/file.py +263 -0
  26. deriva_ml/core/mixins/path_builder.py +145 -0
  27. deriva_ml/core/mixins/rid_resolution.py +204 -0
  28. deriva_ml/core/mixins/vocabulary.py +400 -0
  29. deriva_ml/core/mixins/workflow.py +322 -0
  30. deriva_ml/core/validation.py +389 -0
  31. deriva_ml/dataset/__init__.py +2 -1
  32. deriva_ml/dataset/aux_classes.py +20 -4
  33. deriva_ml/dataset/catalog_graph.py +575 -0
  34. deriva_ml/dataset/dataset.py +1242 -1008
  35. deriva_ml/dataset/dataset_bag.py +1311 -182
  36. deriva_ml/dataset/history.py +27 -14
  37. deriva_ml/dataset/upload.py +225 -38
  38. deriva_ml/demo_catalog.py +186 -105
  39. deriva_ml/execution/__init__.py +46 -2
  40. deriva_ml/execution/base_config.py +639 -0
  41. deriva_ml/execution/execution.py +545 -244
  42. deriva_ml/execution/execution_configuration.py +26 -11
  43. deriva_ml/execution/execution_record.py +592 -0
  44. deriva_ml/execution/find_caller.py +298 -0
  45. deriva_ml/execution/model_protocol.py +175 -0
  46. deriva_ml/execution/multirun_config.py +153 -0
  47. deriva_ml/execution/runner.py +595 -0
  48. deriva_ml/execution/workflow.py +224 -35
  49. deriva_ml/experiment/__init__.py +8 -0
  50. deriva_ml/experiment/experiment.py +411 -0
  51. deriva_ml/feature.py +6 -1
  52. deriva_ml/install_kernel.py +143 -6
  53. deriva_ml/interfaces.py +862 -0
  54. deriva_ml/model/__init__.py +99 -0
  55. deriva_ml/model/annotations.py +1278 -0
  56. deriva_ml/model/catalog.py +286 -60
  57. deriva_ml/model/database.py +144 -649
  58. deriva_ml/model/deriva_ml_database.py +308 -0
  59. deriva_ml/model/handles.py +14 -0
  60. deriva_ml/run_model.py +319 -0
  61. deriva_ml/run_notebook.py +507 -38
  62. deriva_ml/schema/__init__.py +18 -2
  63. deriva_ml/schema/annotations.py +62 -33
  64. deriva_ml/schema/create_schema.py +169 -69
  65. deriva_ml/schema/validation.py +601 -0
  66. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -5
  67. deriva_ml-1.17.11.dist-info/RECORD +77 -0
  68. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
  69. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +2 -0
  70. deriva_ml/protocols/dataset.py +0 -19
  71. deriva_ml/test.py +0 -94
  72. deriva_ml-1.17.9.dist-info/RECORD +0 -45
  73. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
  74. {deriva_ml-1.17.9.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,14 @@ Typical usage example:
  >>> with ml.create_execution(config) as execution:
  ... execution.download_dataset_bag(dataset_spec)
  ... # Run analysis
- ... execution.upload_execution_outputs()
+ ... path = execution.asset_file_path("Model", "model.pt")
+ ... # Write model to path...
+ ...
+ >>> # IMPORTANT: Upload AFTER the context manager exits
+ >>> execution.upload_execution_outputs()
+
+ The context manager handles start/stop timing automatically. The upload_execution_outputs()
+ call must happen AFTER exiting the context manager to ensure proper status tracking.
  """

  from __future__ import annotations
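The revised module docstring above pins down the intended lifecycle: inputs are downloaded and outputs registered inside the context manager, while the upload happens after it exits. A minimal end-to-end sketch of that pattern; ml, workflow, dataset_spec, model_bytes, and the "Model" asset table are placeholders from the surrounding examples, not part of this diff:

```python
# Sketch of the revised lifecycle; names below (ml, workflow, dataset_spec,
# model_bytes) are illustrative placeholders.
from deriva_ml.execution.execution_configuration import ExecutionConfiguration

config = ExecutionConfiguration(workflow=workflow, description="Process samples")

with ml.create_execution(config) as execution:
    bag = execution.download_dataset_bag(dataset_spec)      # materialize inputs
    output_path = execution.asset_file_path("Model", "model.pt")
    output_path.write_bytes(model_bytes)                    # write outputs in place

# Upload only AFTER the context manager exits, so start/stop timing and
# status tracking are recorded correctly.
outputs = execution.upload_execution_outputs()
```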
@@ -28,10 +35,11 @@ import logging
  import os
  import shutil
  import sys
+ import time
  from collections import defaultdict
  from datetime import datetime
  from pathlib import Path
- from typing import Any, Iterable, List
+ from typing import Any, Callable, Iterable, List

  from deriva.core import format_exception
  from deriva.core.hatrac_store import HatracStore
@@ -47,9 +55,12 @@ from deriva_ml.core.definitions import (
  MLAsset,
  MLVocab,
  Status,
+ UploadProgress,
  )
  from deriva_ml.core.exceptions import DerivaMLException
- from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion, VersionPart
+ from deriva_ml.asset.aux_classes import AssetFilePath
+ from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion
+ from deriva_ml.dataset.dataset import Dataset
  from deriva_ml.dataset.dataset_bag import DatasetBag
  from deriva_ml.dataset.upload import (
  asset_file_path,
@@ -65,8 +76,10 @@ from deriva_ml.dataset.upload import (
  )
  from deriva_ml.execution.environment import get_execution_environment
  from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+ from deriva_ml.execution.execution_record import ExecutionRecord
  from deriva_ml.execution.workflow import Workflow
  from deriva_ml.feature import FeatureRecord
+ from deriva_ml.model.deriva_ml_database import DerivaMLDatabase

  # Keep pycharm from complaining about undefined references in docstrings.
  execution: Execution
@@ -90,92 +103,6 @@ except ImportError:
  return s


- # Platform-specific base class
- if sys.version_info >= (3, 12):
-
- class AssetFilePath(Path):
- """Extended Path class for managing asset files.
-
- Represents a file path with additional metadata about its role as an asset in the catalog.
- This class extends the standard Path class to include information about the asset's
- catalog representation and type.
-
- Attributes:
- asset_name (str): Name of the asset in the catalog (e.g., asset table name).
- file_name (str): Name of the local file containing the asset.
- asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
- asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
- asset_rid (RID | None): Resource Identifier if uploaded to an asset table.
-
- Example:
- >>> path = AssetFilePath(
- ... "/path/to/file.txt",
- ... asset_name="analysis_output",
- ... file_name="results.txt",
- ... asset_metadata={"version": "1.0"},
- ... asset_types=["text", "results"]
- ... )
- """
-
- def __init__(
- self,
- asset_path: str | Path,
- asset_name: str,
- file_name: str,
- asset_metadata: dict[str, Any],
- asset_types: list[str] | str,
- asset_rid: RID | None = None,
- ):
- """Initializes an AssetFilePath instance.
-
- Args:
- asset_path: Local path to the asset file.
- asset_name: Name of the asset in the catalog.
- file_name: Name of the local file.
- asset_metadata: Additional metadata columns.
- asset_types: One or more asset type terms.
- asset_rid: Optional Resource Identifier if already in catalog.
- """
- super().__init__(asset_path)
- self.asset_name = asset_name
- self.file_name = file_name
- self.asset_metadata = asset_metadata
- self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
- self.asset_rid = asset_rid
- else:
-
- class AssetFilePath(type(Path())):
- """
- Create a new Path object that has additional information related to the use of this path as an asset.
-
- Attrubytes:
- asset_path: Local path to the location of the asset.
- asset_name: The name of the asset in the catalog (e.g., the asset table name).
- file_name: Name of the local file that contains the contents of the asset.
- asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
- asset_types: A list of terms from the Asset_Type controlled vocabulary.
- asset_rid: The RID of the asset if it has been uploaded into an asset table
- """
-
- def __new__(
- cls,
- asset_path: str | Path,
- asset_name: str,
- file_name: str,
- asset_metadata: dict[str, Any],
- asset_types: list[str] | str,
- asset_rid: RID | None = None,
- ):
- # Only pass the path to the base Path class
- obj = super().__new__(cls, asset_path)
- obj.asset_name = asset_name
- obj.file_name = file_name
- obj.asset_metadata = asset_metadata
- obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
- obj.asset_rid = asset_rid
- return obj
-
-
  class Execution:
  """Manages the lifecycle and context of a DerivaML execution.

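The AssetFilePath class deleted above was not dropped; it moved to deriva_ml/asset/aux_classes.py (see the new import earlier in this diff). The version split it carries is a general pathlib pattern: Path only supports direct subclassing from Python 3.12 on, so older interpreters must subclass the concrete flavour returned by type(Path()) and hook __new__ instead of __init__. A generic sketch of that pattern, with a hypothetical TaggedPath standing in for AssetFilePath:

```python
# Generic sketch of the version-dependent pathlib subclassing trick used by
# the removed class above. TaggedPath is a hypothetical stand-in that attaches
# one extra attribute to a Path.
import sys
from pathlib import Path

if sys.version_info >= (3, 12):
    # Path.__init__ became overridable in 3.12, so subclass directly.
    class TaggedPath(Path):
        def __init__(self, path, tag: str = ""):
            super().__init__(path)
            self.tag = tag
else:
    # Before 3.12, Path.__new__ dispatches to PosixPath/WindowsPath, so
    # subclass the concrete flavour and set attributes in __new__.
    class TaggedPath(type(Path())):
        def __new__(cls, path, tag: str = ""):
            obj = super().__new__(cls, path)
            obj.tag = tag
            return obj

p = TaggedPath("/tmp/model.pt", tag="Model")
print(p.name, p.tag)  # model.pt Model
```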
@@ -201,14 +128,21 @@ class Execution:
  stop_time (datetime | None): When execution completed.

  Example:
- >>> config = ExecutionConfiguration(
- ... workflow="analysis",
- ... description="Process samples",
- ... )
- >>> with ml.create_execution(config) as execution:
- ... execution.download_dataset_bag(dataset_spec)
- ... # Run analysis
- ... execution.upload_execution_outputs()
+ The context manager handles start/stop timing. Upload must be called AFTER
+ the context manager exits::
+
+ >>> config = ExecutionConfiguration(
+ ... workflow="analysis",
+ ... description="Process samples",
+ ... )
+ >>> with ml.create_execution(config) as execution:
+ ... bag = execution.download_dataset_bag(dataset_spec)
+ ... # Run analysis using bag.path
+ ... output_path = execution.asset_file_path("Model", "model.pt")
+ ... # Write results to output_path
+ ...
+ >>> # IMPORTANT: Call upload AFTER exiting the context manager
+ >>> execution.upload_execution_outputs()
  """

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -216,7 +150,7 @@ class Execution:
  self,
  configuration: ExecutionConfiguration,
  ml_object: DerivaML,
- workflow: Workflow | RID | None = None,
+ workflow: Workflow | None = None,
  reload: RID | None = None,
  dry_run: bool = False,
  ):
@@ -228,13 +162,32 @@ class Execution:
  Args:
  configuration: Settings and parameters for the execution.
  ml_object: DerivaML instance managing the execution.
- workflow: Optional workflow RID or Workflow object. If not specified, the workflow RID is taken from
- the ExecutionConfiguration object
+ workflow: Optional Workflow object. If not specified, the workflow is taken from
+ the ExecutionConfiguration object. Must be a Workflow object, not a RID.
  reload: Optional RID of existing execution to reload.
  dry_run: If True, don't create catalog records or upload results.

  Raises:
- DerivaMLException: If initialization fails or configuration is invalid.
+ DerivaMLException: If initialization fails, configuration is invalid,
+ or workflow is not a Workflow object.
+
+ Example:
+ Create an execution with a workflow::
+
+ >>> workflow = ml.lookup_workflow("2-ABC1")
+ >>> config = ExecutionConfiguration(
+ ... workflow=workflow,
+ ... description="Process data"
+ ... )
+ >>> execution = Execution(config, ml)
+
+ Or pass workflow separately::
+
+ >>> workflow = ml.lookup_workflow_by_url(
+ ... "https://github.com/org/repo/blob/abc123/analysis.py"
+ ... )
+ >>> config = ExecutionConfiguration(description="Run analysis")
+ >>> execution = Execution(config, ml, workflow=workflow)
  """

  self.asset_paths: dict[str, list[AssetFilePath]] = {}
@@ -244,9 +197,10 @@ class Execution:
  self._logger = ml_object._logger
  self.start_time = None
  self.stop_time = None
- self.status = Status.created
+ self._status = Status.created
  self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
  self.configuration.argv = sys.argv
+ self._execution_record: ExecutionRecord | None = None # Lazily created after RID is assigned

  self.dataset_rids: List[RID] = []
  self.datasets: list[DatasetBag] = []
@@ -255,18 +209,24 @@ class Execution:
  self._cache_dir = self._ml_object.cache_dir
  self._dry_run = dry_run

- # Make sure we have a good workflow.
+ # Make sure we have a valid Workflow object.
  if workflow:
  self.configuration.workflow = workflow
- if isinstance(self.configuration.workflow, Workflow):
- self._ml_object.lookup_term(MLVocab.workflow_type, configuration.workflow.workflow_type)
- self.workflow_rid = (
- self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
+
+ if self.configuration.workflow is None:
+ raise DerivaMLException("Workflow must be specified either in configuration or as a parameter")
+
+ if not isinstance(self.configuration.workflow, Workflow):
+ raise DerivaMLException(
+ f"Workflow must be a Workflow object, not {type(self.configuration.workflow).__name__}. "
+ "Use ml.lookup_workflow(rid) or ml.lookup_workflow_by_url(url) to get a Workflow object."
  )
- else:
- self.workflow_rid = self.configuration.workflow
- if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
- raise DerivaMLException("Workflow specified in execution configuration is not a Workflow")
+
+ # Validate workflow type and register in catalog
+ self._ml_object.lookup_term(MLVocab.workflow_type, self.configuration.workflow.workflow_type)
+ self.workflow_rid = (
+ self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
+ )

  # Validate the datasets and assets to be valid.
  for d in self.configuration.datasets:
@@ -277,7 +237,7 @@ class Execution:
  if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
  raise DerivaMLException("Asset specified in execution configuration is not a asset table")

- schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+ schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
  if reload:
  self.execution_rid = reload
  if self.execution_rid == DRY_RUN_RID:
@@ -309,6 +269,18 @@ class Execution:

  # Create a directory for execution rid so we can recover the state in case of a crash.
  execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
+
+ # Create the ExecutionRecord to handle catalog state operations
+ if not self._dry_run:
+ self._execution_record = ExecutionRecord(
+ execution_rid=self.execution_rid,
+ workflow=self.configuration.workflow,
+ status=Status.created,
+ description=self.configuration.description,
+ _ml_instance=self._ml_object,
+ _logger=self._logger,
+ )
+
  self._initialize_execution(reload)

  def _save_runtime_environment(self):
@@ -321,31 +293,33 @@ class Execution:
  json.dump(get_execution_environment(), fp)

  def _upload_hydra_config_assets(self):
- """Upload hydra assets to the catalog."""
+ """Upload hydra assets to the catalog with Hydra_Config type."""
  hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
  if hydra_runtime_output_dir:
  timestamp = hydra_runtime_output_dir.parts[-1]
  for hydra_asset in hydra_runtime_output_dir.rglob("*"):
  if hydra_asset.is_dir():
  continue
- asset = self.asset_file_path(
+ # Register file for upload (side effect); result intentionally unused
+ # Use Hydra_Config type for Hydra YAML configuration files
+ self.asset_file_path(
  asset_name=MLAsset.execution_metadata,
  file_name=hydra_runtime_output_dir / hydra_asset,
  rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
- asset_types=ExecMetadataType.execution_config.value,
+ asset_types=ExecMetadataType.hydra_config.value,
  )

  def _initialize_execution(self, reload: RID | None = None) -> None:
- """Initialize the execution by a configuration in the Execution_Metadata table.
- Set up a working directory and download all the assets and data.
+ """Initialize the execution environment.

- :raise DerivaMLException: If there is an issue initializing the execution.
+ Sets up the working directory, downloads required datasets and assets,
+ and saves initial configuration metadata.

  Args:
- reload: RID of previously initialized execution.
-
- Returns:
+ reload: Optional RID of a previously initialized execution to reload.

+ Raises:
+ DerivaMLException: If initialization fails.
  """
  # Materialize bdbag
  for dataset in self.configuration.datasets:
@@ -354,7 +328,7 @@ class Execution:
  self.dataset_rids.append(dataset.rid)

  # Update execution info
- schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+ schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
  if self.dataset_rids and not (reload or self._dry_run):
  schema_path.Dataset_Execution.insert(
  [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
@@ -379,16 +353,21 @@ class Execution:

  # Save configuration details for later upload
  if not reload:
+ # Save DerivaML configuration with Deriva_Config type
  cfile = self.asset_file_path(
  asset_name=MLAsset.execution_metadata,
  file_name="configuration.json",
- asset_types=ExecMetadataType.execution_config.value,
+ asset_types=ExecMetadataType.deriva_config.value,
  )
- with Path(cfile).open("w", encoding="utf-8") as config_file:
- json.dump(self.configuration.model_dump(), config_file)

- lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
- if lock_file.exists():
+ with Path(cfile).open("w", encoding="utf-8") as config_file:
+ json.dump(self.configuration.model_dump(mode="json"), config_file)
+ # Only try to copy uv.lock if git_root is available (local workflow)
+ if self.configuration.workflow.git_root:
+ lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
+ else:
+ lock_file = None
+ if lock_file and lock_file.exists():
  _ = self.asset_file_path(
  asset_name=MLAsset.execution_metadata,
  file_name=lock_file,
@@ -405,6 +384,37 @@ class Execution:
  self.start_time = datetime.now()
  self.update_status(Status.pending, "Initialize status finished.")

+ @property
+ def status(self) -> Status:
+ """Get the current execution status.
+
+ Returns:
+ Status: The current status (Created, Running, Completed, Failed, etc.).
+ """
+ if self._execution_record is not None:
+ return self._execution_record.status
+ return self._status
+
+ @status.setter
+ def status(self, value: Status) -> None:
+ """Set the execution status.
+
+ Args:
+ value: The new status value.
+ """
+ self._status = value
+ if self._execution_record is not None:
+ self._execution_record._status = value
+
+ @property
+ def execution_record(self) -> ExecutionRecord | None:
+ """Get the ExecutionRecord for catalog operations.
+
+ Returns:
+ ExecutionRecord if not in dry_run mode, None otherwise.
+ """
+ return self._execution_record
+
  @property
  def working_dir(self) -> Path:
  """Return the working directory for the execution."""
@@ -412,39 +422,78 @@ class Execution:

  @property
  def _execution_root(self) -> Path:
+ """Get the root directory for this execution's files.
+
+ Returns:
+ Path to the execution-specific directory.
  """
+ return execution_root(self._working_dir, self.execution_rid)

- Args:
+ @property
+ def _feature_root(self) -> Path:
+ """Get the root directory for feature files.

  Returns:
- :return:
+ Path to the feature directory within the execution.
+ """
+ return feature_root(self._working_dir, self.execution_rid)
+
+ @property
+ def _asset_root(self) -> Path:
+ """Get the root directory for asset files.

+ Returns:
+ Path to the asset directory within the execution.
  """
- return execution_root(self._working_dir, self.execution_rid)
+ return asset_root(self._working_dir, self.execution_rid)

  @property
- def _feature_root(self) -> Path:
- """The root path to all execution-specific files.
- :return:
+ def database_catalog(self) -> DerivaMLDatabase | None:
+ """Get a catalog-like interface for downloaded datasets.

- Args:
+ Returns a DerivaMLDatabase that implements the DerivaMLCatalog
+ protocol, allowing the same code to work with both live catalogs
+ and downloaded bags.
+
+ This is useful for writing code that can operate on either a live
+ catalog (via DerivaML) or on downloaded bags (via DerivaMLDatabase).

  Returns:
+ DerivaMLDatabase wrapping the primary downloaded dataset's model,
+ or None if no datasets have been downloaded.

+ Example:
+ >>> with ml.create_execution(config) as exe:
+ ... if exe.database_catalog:
+ ... db = exe.database_catalog
+ ... # Use same interface as DerivaML
+ ... dataset = db.lookup_dataset("4HM")
+ ... term = db.lookup_term("Diagnosis", "cancer")
+ ... else:
+ ... # No datasets downloaded, use live catalog
+ ... pass
  """
- return feature_root(self._working_dir, self.execution_rid)
+ if not self.datasets:
+ return None
+ # Use the first dataset's model as the primary
+ return DerivaMLDatabase(self.datasets[0].model)

  @property
- def _asset_root(self) -> Path:
- """The root path to all execution-specific files.
- :return:
+ def catalog(self) -> "DerivaML":
+ """Get the live catalog (DerivaML) instance for this execution.

- Args:
+ This provides access to the live catalog for operations that require
+ catalog connectivity, such as looking up datasets or other read operations.

  Returns:
+ DerivaML: The live catalog instance.

+ Example:
+ >>> with ml.create_execution(config) as exe:
+ ... # Use live catalog for lookups
+ ... existing_dataset = exe.catalog.lookup_dataset("1-ABC")
  """
- return asset_root(self._working_dir, self.execution_rid)
+ return self._ml_object

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
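Since database_catalog and catalog expose the same lookup surface (the DerivaMLCatalog protocol mentioned in the docstring above), analysis code can be written once and pointed at whichever is available. A hedged sketch; resolve_catalog and summarize are illustrative helpers, and lookup_dataset follows the docstring examples:

```python
# Illustrative helpers built on the two properties above; not part of deriva-ml.
def resolve_catalog(exe):
    """Prefer the downloaded-bag view when datasets were materialized."""
    return exe.database_catalog or exe.catalog

def summarize(exe, dataset_rid: str) -> None:
    db = resolve_catalog(exe)                 # DerivaMLDatabase or live DerivaML
    dataset = db.lookup_dataset(dataset_rid)
    print(f"{dataset_rid}: {dataset.description}")
```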
@@ -471,7 +520,7 @@ class Execution:
  >>> bag = execution.download_dataset_bag(spec)
  >>> print(f"Downloaded to {bag.path}")
  """
- return self._ml_object.download_dataset_bag(dataset, execution_rid=self.execution_rid)
+ return self._ml_object.download_dataset_bag(dataset)

  @validate_call
  def update_status(self, status: Status, msg: str) -> None:
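Both the new status property earlier in this diff and the update_status change in the next hunk follow the same shape: delegate to the ExecutionRecord when one exists, and fall back to local state (the dry-run path) otherwise. A stripped-down sketch of that read-through pattern, with illustrative names:

```python
# Illustrative read-through delegation, mirroring Execution.status above.
class Record:
    """Stands in for ExecutionRecord (catalog-backed state)."""
    def __init__(self) -> None:
        self.status = "created"

class Facade:
    """Stands in for Execution: delegates when a record exists."""
    def __init__(self, record: Record | None = None) -> None:
        self._record = record
        self._status = "created"   # local fallback, e.g. dry-run mode

    @property
    def status(self) -> str:
        return self._record.status if self._record is not None else self._status

print(Facade().status)          # "created" from local state (dry run)
print(Facade(Record()).status)  # "created" read through the record
```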
@@ -490,21 +539,26 @@ class Execution:
  Example:
  >>> execution.update_status(Status.running, "Processing sample 1 of 10")
  """
- self.status = status
+ self._status = status
  self._logger.info(msg)

  if self._dry_run:
  return

- self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
- [
- {
- "RID": self.execution_rid,
- "Status": self.status.value,
- "Status_Detail": msg,
- }
- ]
- )
+ # Delegate to ExecutionRecord for catalog updates
+ if self._execution_record is not None:
+ self._execution_record.update_status(status, msg)
+ else:
+ # Fallback for cases where ExecutionRecord isn't available
+ self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
+ [
+ {
+ "RID": self.execution_rid,
+ "Status": status.value,
+ "Status_Detail": msg,
+ }
+ ]
+ )

  def execution_start(self) -> None:
  """Marks the execution as started.
@@ -545,17 +599,23 @@ class Execution:

  self.update_status(Status.completed, "Algorithm execution ended.")
  if not self._dry_run:
- self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
+ self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
  [{"RID": self.execution_rid, "Duration": duration}]
  )

- def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
+ def _upload_execution_dirs(
+ self, progress_callback: Callable[[UploadProgress], None] | None = None
+ ) -> dict[str, list[AssetFilePath]]:
  """Upload execution assets at _working_dir/Execution_asset.

  This routine uploads the contents of the
  Execution_Asset directory and then updates the execution_asset table in the ML schema to have references
  to these newly uploaded files.

+ Args:
+ progress_callback: Optional callback function to receive upload progress updates.
+ Called with UploadProgress objects containing file information and progress.
+
  Returns:
  dict: Results of the upload operation.

@@ -565,11 +625,11 @@ class Execution:

  try:
  self.update_status(Status.running, "Uploading execution files...")
- results = upload_directory(self._model, self._asset_root)
- except RuntimeError as e:
+ results = upload_directory(self._model, self._asset_root, progress_callback=progress_callback)
+ except (RuntimeError, DerivaMLException) as e:
  error = format_exception(e)
  self.update_status(Status.failed, error)
- raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
+ raise DerivaMLException(f"Failed to upload execution_assets: {error}")

  asset_map = {}
  for path, status in results.items():
@@ -578,7 +638,7 @@ class Execution:
  asset_map.setdefault(asset_table, []).append(
  AssetFilePath(
  asset_path=path,
- asset_name=asset_table,
+ asset_table=asset_table,
  file_name=file_name,
  asset_metadata={
  k: v
@@ -629,7 +689,7 @@ class Execution:
  hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())

  asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
- type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
+ type_path = self._ml_object.pathBuilder().schemas[asset_type_table.schema.name].tables[asset_type_table.name]
  asset_types = [
  asset_type[MLVocab.asset_type.value]
  for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
@@ -642,7 +702,7 @@ class Execution:
  asset_rid=asset_rid,
  asset_path=asset_filename,
  asset_metadata=asset_metadata,
- asset_name=asset_table.name,
+ asset_table=asset_table.name,
  asset_types=asset_types,
  )

@@ -690,15 +750,26 @@ class Execution:
  results = upload_directory(self._model, assets_dir)
  return {path_to_asset(p): r for p, r in results.items()}

- def upload_execution_outputs(self, clean_folder: bool = True) -> dict[str, list[AssetFilePath]]:
+ def upload_execution_outputs(
+ self, clean_folder: bool | None = None, progress_callback: Callable[[UploadProgress], None] | None = None
+ ) -> dict[str, list[AssetFilePath]]:
  """Uploads all outputs from the execution to the catalog.

  Scans the execution's output directories for assets, features, and other results,
  then uploads them to the catalog. Can optionally clean up the output folders
  after successful upload.

+ IMPORTANT: This method must be called AFTER exiting the context manager, not inside it.
+ The context manager handles execution timing (start/stop), while this method handles
+ the separate upload step.
+
  Args:
- clean_folder: Whether to delete output folders after upload. Defaults to True.
+ clean_folder: Whether to delete output folders after upload. If None (default),
+ uses the DerivaML instance's clean_execution_dir setting. Pass True/False
+ to override for this specific execution.
+ progress_callback: Optional callback function to receive upload progress updates.
+ Called with UploadProgress objects containing file name, bytes uploaded,
+ total bytes, percent complete, phase, and status message.

  Returns:
  dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.
@@ -707,14 +778,28 @@ class Execution:
  Raises:
  DerivaMLException: If upload fails or outputs are invalid.
  Example:
- >>> outputs = execution.upload_execution_outputs()
- >>> for type_name, paths in outputs.items():
- ... print(f"{type_name}: {len(paths)} files")
+ >>> with ml.create_execution(config) as execution:
+ ... # Do ML work, register output files with asset_file_path()
+ ... path = execution.asset_file_path("Model", "model.pt")
+ ... # Write to path...
+ ...
+ >>> # Upload AFTER the context manager exits
+ >>> def my_callback(progress):
+ ... print(f"Uploading {progress.file_name}: {progress.percent_complete:.1f}%")
+ >>> outputs = execution.upload_execution_outputs(progress_callback=my_callback)
+ >>>
+ >>> # Override cleanup setting for this execution
+ >>> outputs = execution.upload_execution_outputs(clean_folder=False) # Keep files
  """
  if self._dry_run:
  return {}
+
+ # Use DerivaML instance setting if not explicitly provided
+ if clean_folder is None:
+ clean_folder = getattr(self._ml_object, 'clean_execution_dir', True)
+
  try:
- self.uploaded_assets = self._upload_execution_dirs()
+ self.uploaded_assets = self._upload_execution_dirs(progress_callback=progress_callback)
  self.update_status(Status.completed, "Successfully end the execution.")
  if clean_folder:
  self._clean_folder_contents(self._execution_root)
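The progress_callback added here threads through _upload_execution_dirs to upload_directory. A small sketch of a console progress-bar callback, assuming UploadProgress exposes the file_name and percent_complete fields named in the docstring above:

```python
# Illustrative progress callback; assumes the UploadProgress fields listed
# in the docstring (file_name, percent_complete).
def progress_bar(progress) -> None:
    filled = int(progress.percent_complete / 5)          # 20-character bar
    bar = "#" * filled + "-" * (20 - filled)
    print(f"\r{progress.file_name[:40]:<40} [{bar}] {progress.percent_complete:5.1f}%",
          end="", flush=True)

# outputs = execution.upload_execution_outputs(progress_callback=progress_bar)
```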
@@ -724,14 +809,17 @@ class Execution:
  self.update_status(Status.failed, error)
  raise e

- def _clean_folder_contents(self, folder_path: Path):
- """Clean up folder contents with Windows-compatible error handling.
+ def _clean_folder_contents(self, folder_path: Path, remove_folder: bool = True):
+ """Clean up folder contents and optionally the folder itself.
+
+ Removes all files and subdirectories within the specified folder.
+ Uses retry logic for Windows compatibility where files may be temporarily locked.

  Args:
- folder_path: Path to the folder to clean
+ folder_path: Path to the folder to clean.
+ remove_folder: If True (default), also remove the folder itself after
+ cleaning its contents. If False, only remove contents.
  """
- import time
-
  MAX_RETRIES = 3
  RETRY_DELAY = 1 # seconds

@@ -745,20 +833,26 @@ class Execution:
  return True
  except (OSError, PermissionError) as e:
  if attempt == MAX_RETRIES - 1:
- self.update_status(Status.failed, format_exception(e))
+ logging.warning(f"Failed to remove {path}: {e}")
  return False
  time.sleep(RETRY_DELAY)
  return False

  try:
+ # First remove all contents
  with os.scandir(folder_path) as entries:
  for entry in entries:
  if entry.is_dir() and not entry.is_symlink():
  remove_with_retry(Path(entry.path), is_dir=True)
  else:
  remove_with_retry(Path(entry.path))
+
+ # Then remove the folder itself if requested
+ if remove_folder:
+ remove_with_retry(folder_path, is_dir=True)
+
  except OSError as e:
- self.update_status(Status.failed, format_exception(e))
+ logging.warning(f"Failed to clean folder {folder_path}: {e}")

  def _update_feature_table(
  self,
@@ -767,14 +861,16 @@ class Execution:
  feature_file: str | Path,
  uploaded_files: dict[str, list[AssetFilePath]],
  ) -> None:
- """
+ """Update the feature table with values from a JSONL file.
+
+ Reads feature values from a file and inserts them into the catalog,
+ replacing file paths with the RIDs of uploaded assets.

  Args:
- target_table: str:
- feature_name: str:
- feature_file: str | Path:
- uploaded_files: Dictionary whose key is an asset name, file-name pair, and whose value is a filename,
- RID of that asset.
+ target_table: Name of the table the feature is defined on.
+ feature_name: Name of the feature to update.
+ feature_file: Path to JSONL file containing feature values.
+ uploaded_files: Map from asset table names to their uploaded AssetFilePath objects.
  """

  # Get the column names of all the Feature columns that should be the RID of an asset
@@ -804,7 +900,7 @@ class Execution:
  with Path(feature_file).open("r") as feature_values:
  entities = [json.loads(line.strip()) for line in feature_values]
  # Update the asset columns in the feature and add to the catalog.
- self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
+ self._ml_object.domain_path().tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)

  def _update_asset_execution_table(
  self,
@@ -824,7 +920,7 @@ class Execution:
  return
  self._ml_object.lookup_term(MLVocab.asset_role, asset_role)

- pb = self._ml_object.pathBuilder
+ pb = self._ml_object.pathBuilder()
  for asset_table, asset_list in uploaded_assets.items():
  asset_table_name = asset_table.split("/")[1] # Peel off the schema from the asset table
  asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
@@ -924,6 +1020,11 @@ class Execution:
  # There is a funny bug with S3 hatrac if we have the leading _ in the filename.
  file_name = file_name.with_name("-implementations.log")

+ # Resolve relative paths to absolute paths to ensure exists() and symlink work correctly
+ # regardless of the current working directory
+ if not file_name.is_absolute():
+ file_name = file_name.resolve()
+
  target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
  asset_path = asset_file_path(
  prefix=self._working_dir,
@@ -949,7 +1050,7 @@ class Execution:

  return AssetFilePath(
  asset_path=asset_path,
- asset_name=asset_name,
+ asset_table=asset_name,
  file_name=target_name.name,
  asset_metadata=kwargs,
  asset_types=asset_types,
@@ -964,10 +1065,18 @@ class Execution:
  Returns:
  Pathlib path to the file in which to place table values.
  """
- if table not in self._model.schemas[self._ml_object.domain_schema].tables:
- raise DerivaMLException("Table '{}' not found in domain schema".format(table))
+ # Find which domain schema contains this table
+ table_schema = None
+ for domain_schema in self._ml_object.domain_schemas:
+ if domain_schema in self._model.schemas:
+ if table in self._model.schemas[domain_schema].tables:
+ table_schema = domain_schema
+ break

- return table_path(self._working_dir, schema=self._ml_object.domain_schema, table=table)
+ if table_schema is None:
+ raise DerivaMLException("Table '{}' not found in any domain schema".format(table))
+
+ return table_path(self._working_dir, schema=table_schema, table=table)

  def execute(self) -> Execution:
  """Initiate an execution with the provided configuration. Can be used in a context manager."""
@@ -1005,9 +1114,11 @@ class Execution:
  # Update feature records to include current execution_rid
  first_row = features[0]
  feature = first_row.feature
+ # Use the schema from the feature table
+ feature_schema = feature.feature_table.schema.name
  json_path = feature_value_path(
  self._working_dir,
- schema=self._ml_object.domain_schema,
+ schema=feature_schema,
  target_table=feature.target_table.name,
  feature_name=feature.feature_name,
  exec_rid=self.execution_rid,
@@ -1017,78 +1128,93 @@ class Execution:
  feature.Execution = self.execution_rid
  file.write(json.dumps(feature.model_dump(mode="json")) + "\n")

- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
- def create_dataset(
- self,
- dataset_types: str | list[str],
- description: str,
- version: DatasetVersion | None = None,
- ) -> RID:
- """Create a new dataset with specified types.
-
- Args:
- dataset_types: param description:
- description: Markdown description of the dataset being created.
- version: Version to assign to the dataset. Defaults to 0.1.0
+ def list_input_datasets(self) -> list[Dataset]:
+ """List all datasets that were inputs to this execution.

  Returns:
- RID of the newly created dataset.
+ List of Dataset objects that were used as inputs.
+
+ Example:
+ >>> for ds in execution.list_input_datasets():
+ ... print(f"Input: {ds.dataset_rid} - {ds.description}")
  """
- return self._ml_object.create_dataset(dataset_types, description, self.execution_rid, version=version)
+ if self._execution_record is not None:
+ return self._execution_record.list_input_datasets()

- def add_dataset_members(
- self,
- dataset_rid: RID,
- members: list[RID] | dict[str, list[RID]],
- validate: bool = True,
- description: str = "",
- ) -> None:
- """Add additional elements to an existing dataset_table.
+ # Fallback for dry_run mode
+ pb = self._ml_object.pathBuilder()
+ dataset_exec = pb.schemas[self._ml_object.ml_schema].Dataset_Execution

- Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
- dataset is incremented and the description, if provide is applied to that new version.
+ records = list(
+ dataset_exec.filter(dataset_exec.Execution == self.execution_rid)
+ .entities()
+ .fetch()
+ )
+
+ return [self._ml_object.lookup_dataset(r["Dataset"]) for r in records]

- The RIDs in the list to not have to be all from the same table, but they must be from a table that has
- been configured to be a dataset element type.
+ def list_assets(self, asset_role: str | None = None) -> list["Asset"]:
+ """List all assets that were inputs or outputs of this execution.

  Args:
- dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
- members: List of RIDs of members to add to the dataset_table. RID must be to a table type that is a
- dataset element type (see DerivaML.add_dataset_element_type).
- validate: Check rid_list to make sure elements are not already in the dataset_table.
- description: Markdown description of the updated dataset.
+ asset_role: Optional filter: "Input" or "Output". If None, returns all.
+
+ Returns:
+ List of Asset objects associated with this execution.
+
+ Example:
+ >>> inputs = execution.list_assets(asset_role="Input")
+ >>> outputs = execution.list_assets(asset_role="Output")
  """
- return self._ml_object.add_dataset_members(
- dataset_rid=dataset_rid,
- members=members,
- validate=validate,
- description=description,
- execution_rid=self.execution_rid,
- )
+ if self._execution_record is not None:
+ return self._execution_record.list_assets(asset_role=asset_role)
+
+ # Fallback for dry_run mode
+ from deriva_ml.asset.asset import Asset
+
+ pb = self._ml_object.pathBuilder()
+ asset_exec = pb.schemas[self._ml_object.ml_schema].Execution_Asset_Execution
+
+ query = asset_exec.filter(asset_exec.Execution == self.execution_rid)
+ if asset_role:
+ query = query.filter(asset_exec.Asset_Role == asset_role)
+
+ records = list(query.entities().fetch())
+
+ assets = []
+ for r in records:
+ try:
+ asset = self._ml_object.lookup_asset(r["Execution_Asset"])
+ assets.append(asset)
+ except Exception:
+ pass # Skip assets that can't be looked up
+ return assets
+
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def create_dataset(
+ self,
+ dataset_types: str | list[str] | None = None,
+ version: DatasetVersion | str | None = None,
+ description: str = "",
+ ) -> Dataset:
+ """Create a new dataset with specified types.

- def increment_dataset_version(
- self, dataset_rid: RID, component: VersionPart, description: str = ""
- ) -> DatasetVersion:
- """Increment the version of the specified dataset_table.
+ Creates a dataset associated with this execution for provenance tracking.

  Args:
- dataset_rid: RID to a dataset_table
- component: Which version of the dataset_table to increment.
- dataset_rid: RID of the dataset whose version is to be incremented.
- component: Major, Minor, or Patch
- description: Description of the version update of the dataset_table.
+ dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
+ description: Markdown description of the dataset being created.
+ version: Dataset version. Defaults to 0.1.0.

  Returns:
- new semantic version of the dataset_table as a 3-tuple
-
- Raises:
- DerivaMLException: if provided RID is not to a dataset_table.
+ The newly created Dataset.
  """
- return self._ml_object.increment_dataset_version(
- dataset_rid=dataset_rid,
- component=component,
- description=description,
+ return Dataset.create_dataset(
+ ml_instance=self._ml_object,
  execution_rid=self.execution_rid,
+ dataset_types=dataset_types,
+ version=version,
+ description=description,
  )

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -1097,7 +1223,7 @@ class Execution:
  files: Iterable[FileSpec],
  dataset_types: str | list[str] | None = None,
  description: str = "",
- ) -> RID:
+ ) -> "Dataset":
  """Adds files to the catalog with their metadata.

  Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
@@ -1109,7 +1235,7 @@ class Execution:
  description: Description of the files.

  Returns:
- RID: Dataset RID that identifies newly added files. Will be nested to mirror original directory structure
+ RID: Dataset that identifies newly added files. Will be nested to mirror original directory structure
  of the files.

  Raises:
@@ -1128,11 +1254,186 @@ class Execution:
  """
  return self._ml_object.add_files(
  files=files,
- dataset_types=dataset_types,
  execution_rid=self.execution_rid,
+ dataset_types=dataset_types,
  description=description,
  )

+ # =========================================================================
+ # Execution Nesting Methods
+ # =========================================================================
+
+ def add_nested_execution(
+ self,
+ nested_execution: "Execution | ExecutionRecord | RID",
+ sequence: int | None = None,
+ ) -> None:
+ """Add a nested (child) execution to this execution.
+
+ Creates a parent-child relationship between this execution and another.
+ This is useful for grouping related executions, such as parameter sweeps
+ or pipeline stages.
+
+ Args:
+ nested_execution: The child execution to add (Execution, ExecutionRecord, or RID).
+ sequence: Optional ordering index (0, 1, 2...). Use None for parallel executions.
+
+ Raises:
+ DerivaMLException: If the association cannot be created.
+
+ Example:
+ >>> parent_exec = ml.create_execution(parent_config)
+ >>> child_exec = ml.create_execution(child_config)
+ >>> parent_exec.add_nested_execution(child_exec, sequence=0)
+ """
+ if self._dry_run:
+ return
+
+ # Get the RID from the nested execution
+ if isinstance(nested_execution, Execution):
+ nested_rid = nested_execution.execution_rid
+ elif isinstance(nested_execution, ExecutionRecord):
+ nested_rid = nested_execution.execution_rid
+ else:
+ nested_rid = nested_execution
+
+ # Delegate to ExecutionRecord if available
+ if self._execution_record is not None:
+ self._execution_record.add_nested_execution(nested_rid, sequence=sequence)
+ else:
+ # Fallback for cases without execution record
+ pb = self._ml_object.pathBuilder()
+ execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+ record = {
+ "Execution": self.execution_rid,
+ "Nested_Execution": nested_rid,
+ }
+ if sequence is not None:
+ record["Sequence"] = sequence
+
+ execution_execution.insert([record])
+
+ def list_nested_executions(
+ self,
+ recurse: bool = False,
+ _visited: set[RID] | None = None,
+ ) -> list["ExecutionRecord"]:
+ """List all nested (child) executions of this execution.
+
+ Args:
+ recurse: If True, recursively return all descendant executions.
+ _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+ Returns:
+ List of nested ExecutionRecord objects, ordered by sequence if available.
+ To get full Execution objects with lifecycle management, use restore_execution().
+
+ Example:
+ >>> children = parent_exec.list_nested_executions()
+ >>> all_descendants = parent_exec.list_nested_executions(recurse=True)
+ """
+ if self._execution_record is not None:
+ return list(self._execution_record.list_nested_executions(recurse=recurse, _visited=_visited))
+
+ # Fallback for dry_run mode
+ if _visited is None:
+ _visited = set()
+
+ if self.execution_rid in _visited:
+ return []
+ _visited.add(self.execution_rid)
+
+ pb = self._ml_object.pathBuilder()
+ execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+ # Query for nested executions, ordered by sequence
+ nested = list(
+ execution_execution.filter(execution_execution.Execution == self.execution_rid)
+ .entities()
+ .fetch()
+ )
+
+ # Sort by sequence (None values at the end)
+ nested.sort(key=lambda x: (x.get("Sequence") is None, x.get("Sequence")))
+
+ children = []
+ for record in nested:
+ child = self._ml_object.lookup_execution(record["Nested_Execution"])
+ children.append(child)
+ if recurse:
+ children.extend(child.list_nested_executions(recurse=True, _visited=_visited))
+
+ return children
+
+ def list_parent_executions(
+ self,
+ recurse: bool = False,
+ _visited: set[RID] | None = None,
+ ) -> list["ExecutionRecord"]:
+ """List all parent executions that contain this execution as a nested child.
+
+ Args:
+ recurse: If True, recursively return all ancestor executions.
+ _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+ Returns:
+ List of parent ExecutionRecord objects.
+ To get full Execution objects with lifecycle management, use restore_execution().
+
+ Example:
+ >>> parents = child_exec.list_parent_executions()
+ >>> all_ancestors = child_exec.list_parent_executions(recurse=True)
+ """
+ if self._execution_record is not None:
+ return list(self._execution_record.list_parent_executions(recurse=recurse, _visited=_visited))
+
+ # Fallback for dry_run mode
+ if _visited is None:
+ _visited = set()
+
+ if self.execution_rid in _visited:
+ return []
+ _visited.add(self.execution_rid)
+
+ pb = self._ml_object.pathBuilder()
+ execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+ parent_records = list(
+ execution_execution.filter(execution_execution.Nested_Execution == self.execution_rid)
+ .entities()
+ .fetch()
+ )
+
+ parents = []
+ for record in parent_records:
+ parent = self._ml_object.lookup_execution(record["Execution"])
+ parents.append(parent)
+ if recurse:
+ parents.extend(parent.list_parent_executions(recurse=True, _visited=_visited))
+
+ return parents
+
+ def is_nested(self) -> bool:
+ """Check if this execution is nested within another execution.
+
+ Returns:
+ True if this execution has at least one parent execution.
+ """
+ if self._execution_record is not None:
+ return self._execution_record.is_nested()
+ return len(self.list_parent_executions()) > 0
+
+ def is_parent(self) -> bool:
+ """Check if this execution has nested child executions.
+
+ Returns:
+ True if this execution has at least one nested execution.
+ """
+ if self._execution_record is not None:
+ return self._execution_record.is_parent()
+ return len(self.list_nested_executions()) > 0
+
  def __str__(self):
  items = [
  f"caching_dir: {self._cache_dir}",
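The nesting API added in the final hunk is enough to group a parameter sweep under one parent run. A hedged sketch, assuming ml, parent_config, and sweep_configs are set up as in the earlier examples:

```python
# Illustrative parameter-sweep grouping using the nesting API above;
# ml, parent_config, and sweep_configs are assumed placeholders.
parent = ml.create_execution(parent_config)
for i, cfg in enumerate(sweep_configs):
    child = ml.create_execution(cfg)
    parent.add_nested_execution(child, sequence=i)   # ordered stages

for rec in parent.list_nested_executions():          # ExecutionRecord objects
    print(rec.execution_rid, rec.status)
assert parent.is_parent()
```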