deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deriva_ml/__init__.py +69 -1
- deriva_ml/asset/__init__.py +17 -0
- deriva_ml/asset/asset.py +357 -0
- deriva_ml/asset/aux_classes.py +100 -0
- deriva_ml/bump_version.py +254 -11
- deriva_ml/catalog/__init__.py +31 -0
- deriva_ml/catalog/clone.py +1939 -0
- deriva_ml/catalog/localize.py +426 -0
- deriva_ml/core/__init__.py +29 -0
- deriva_ml/core/base.py +845 -1067
- deriva_ml/core/config.py +169 -21
- deriva_ml/core/constants.py +120 -19
- deriva_ml/core/definitions.py +123 -13
- deriva_ml/core/enums.py +47 -73
- deriva_ml/core/ermrest.py +226 -193
- deriva_ml/core/exceptions.py +297 -14
- deriva_ml/core/filespec.py +99 -28
- deriva_ml/core/logging_config.py +225 -0
- deriva_ml/core/mixins/__init__.py +42 -0
- deriva_ml/core/mixins/annotation.py +915 -0
- deriva_ml/core/mixins/asset.py +384 -0
- deriva_ml/core/mixins/dataset.py +237 -0
- deriva_ml/core/mixins/execution.py +408 -0
- deriva_ml/core/mixins/feature.py +365 -0
- deriva_ml/core/mixins/file.py +263 -0
- deriva_ml/core/mixins/path_builder.py +145 -0
- deriva_ml/core/mixins/rid_resolution.py +204 -0
- deriva_ml/core/mixins/vocabulary.py +400 -0
- deriva_ml/core/mixins/workflow.py +322 -0
- deriva_ml/core/validation.py +389 -0
- deriva_ml/dataset/__init__.py +2 -1
- deriva_ml/dataset/aux_classes.py +20 -4
- deriva_ml/dataset/catalog_graph.py +575 -0
- deriva_ml/dataset/dataset.py +1242 -1008
- deriva_ml/dataset/dataset_bag.py +1311 -182
- deriva_ml/dataset/history.py +27 -14
- deriva_ml/dataset/upload.py +225 -38
- deriva_ml/demo_catalog.py +126 -110
- deriva_ml/execution/__init__.py +46 -2
- deriva_ml/execution/base_config.py +639 -0
- deriva_ml/execution/execution.py +543 -242
- deriva_ml/execution/execution_configuration.py +26 -11
- deriva_ml/execution/execution_record.py +592 -0
- deriva_ml/execution/find_caller.py +298 -0
- deriva_ml/execution/model_protocol.py +175 -0
- deriva_ml/execution/multirun_config.py +153 -0
- deriva_ml/execution/runner.py +595 -0
- deriva_ml/execution/workflow.py +223 -34
- deriva_ml/experiment/__init__.py +8 -0
- deriva_ml/experiment/experiment.py +411 -0
- deriva_ml/feature.py +6 -1
- deriva_ml/install_kernel.py +143 -6
- deriva_ml/interfaces.py +862 -0
- deriva_ml/model/__init__.py +99 -0
- deriva_ml/model/annotations.py +1278 -0
- deriva_ml/model/catalog.py +286 -60
- deriva_ml/model/database.py +144 -649
- deriva_ml/model/deriva_ml_database.py +308 -0
- deriva_ml/model/handles.py +14 -0
- deriva_ml/run_model.py +319 -0
- deriva_ml/run_notebook.py +507 -38
- deriva_ml/schema/__init__.py +18 -2
- deriva_ml/schema/annotations.py +62 -33
- deriva_ml/schema/create_schema.py +169 -69
- deriva_ml/schema/validation.py +601 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
- deriva_ml-1.17.12.dist-info/RECORD +77 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
- deriva_ml/protocols/dataset.py +0 -19
- deriva_ml/test.py +0 -94
- deriva_ml-1.17.10.dist-info/RECORD +0 -45
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
- {deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0
deriva_ml/execution/execution.py
CHANGED
@@ -18,7 +18,14 @@ Typical usage example:
     >>> with ml.create_execution(config) as execution:
     ...     execution.download_dataset_bag(dataset_spec)
    ...     # Run analysis
-    ...     execution.
+    ...     path = execution.asset_file_path("Model", "model.pt")
+    ...     # Write model to path...
+    ...
+    >>> # IMPORTANT: Upload AFTER the context manager exits
+    >>> execution.upload_execution_outputs()
+
+The context manager handles start/stop timing automatically. The upload_execution_outputs()
+call must happen AFTER exiting the context manager to ensure proper status tracking.
 """
 
 from __future__ import annotations
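
For orientation, here is the pattern the rewritten module docstring describes, as one runnable-style sketch. It assumes a configured `DerivaML` instance `ml`, an `ExecutionConfiguration` `config`, and a `DatasetSpec` `dataset_spec` (names taken from the doctest above); the point this release stresses is that the upload happens after the `with` block exits:

```python
# Sketch only; ml, config, and dataset_spec are assumed to be set up already.
with ml.create_execution(config) as execution:
    bag = execution.download_dataset_bag(dataset_spec)      # fetch inputs
    path = execution.asset_file_path("Model", "model.pt")   # register an output file
    # ... run the analysis and write the model to `path` ...

# Upload OUTSIDE the context manager so status tracking stays correct.
execution.upload_execution_outputs()
```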
@@ -28,10 +35,11 @@ import logging
 import os
 import shutil
 import sys
+import time
 from collections import defaultdict
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterable, List
+from typing import Any, Callable, Iterable, List
 
 from deriva.core import format_exception
 from deriva.core.hatrac_store import HatracStore
@@ -47,9 +55,12 @@ from deriva_ml.core.definitions import (
     MLAsset,
     MLVocab,
     Status,
+    UploadProgress,
 )
 from deriva_ml.core.exceptions import DerivaMLException
-from deriva_ml.
+from deriva_ml.asset.aux_classes import AssetFilePath
+from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion
+from deriva_ml.dataset.dataset import Dataset
 from deriva_ml.dataset.dataset_bag import DatasetBag
 from deriva_ml.dataset.upload import (
     asset_file_path,
@@ -65,8 +76,10 @@ from deriva_ml.dataset.upload import (
 )
 from deriva_ml.execution.environment import get_execution_environment
 from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.execution_record import ExecutionRecord
 from deriva_ml.execution.workflow import Workflow
 from deriva_ml.feature import FeatureRecord
+from deriva_ml.model.deriva_ml_database import DerivaMLDatabase
 
 # Keep pycharm from complaining about undefined references in docstrings.
 execution: Execution
@@ -90,92 +103,6 @@ except ImportError:
         return s
 
 
-# Platform-specific base class
-if sys.version_info >= (3, 12):
-
-    class AssetFilePath(Path):
-        """Extended Path class for managing asset files.
-
-        Represents a file path with additional metadata about its role as an asset in the catalog.
-        This class extends the standard Path class to include information about the asset's
-        catalog representation and type.
-
-        Attributes:
-            asset_name (str): Name of the asset in the catalog (e.g., asset table name).
-            file_name (str): Name of the local file containing the asset.
-            asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
-            asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
-            asset_rid (RID | None): Resource Identifier if uploaded to an asset table.
-
-        Example:
-            >>> path = AssetFilePath(
-            ...     "/path/to/file.txt",
-            ...     asset_name="analysis_output",
-            ...     file_name="results.txt",
-            ...     asset_metadata={"version": "1.0"},
-            ...     asset_types=["text", "results"]
-            ... )
-        """
-
-        def __init__(
-            self,
-            asset_path: str | Path,
-            asset_name: str,
-            file_name: str,
-            asset_metadata: dict[str, Any],
-            asset_types: list[str] | str,
-            asset_rid: RID | None = None,
-        ):
-            """Initializes an AssetFilePath instance.
-
-            Args:
-                asset_path: Local path to the asset file.
-                asset_name: Name of the asset in the catalog.
-                file_name: Name of the local file.
-                asset_metadata: Additional metadata columns.
-                asset_types: One or more asset type terms.
-                asset_rid: Optional Resource Identifier if already in catalog.
-            """
-            super().__init__(asset_path)
-            self.asset_name = asset_name
-            self.file_name = file_name
-            self.asset_metadata = asset_metadata
-            self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
-            self.asset_rid = asset_rid
-else:
-
-    class AssetFilePath(type(Path())):
-        """
-        Create a new Path object that has additional information related to the use of this path as an asset.
-
-        Attrubytes:
-            asset_path: Local path to the location of the asset.
-            asset_name: The name of the asset in the catalog (e.g., the asset table name).
-            file_name: Name of the local file that contains the contents of the asset.
-            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
-            asset_types: A list of terms from the Asset_Type controlled vocabulary.
-            asset_rid: The RID of the asset if it has been uploaded into an asset table
-        """
-
-        def __new__(
-            cls,
-            asset_path: str | Path,
-            asset_name: str,
-            file_name: str,
-            asset_metadata: dict[str, Any],
-            asset_types: list[str] | str,
-            asset_rid: RID | None = None,
-        ):
-            # Only pass the path to the base Path class
-            obj = super().__new__(cls, asset_path)
-            obj.asset_name = asset_name
-            obj.file_name = file_name
-            obj.asset_metadata = asset_metadata
-            obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
-            obj.asset_rid = asset_rid
-            return obj
-
-
 class Execution:
     """Manages the lifecycle and context of a DerivaML execution.
 
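
The block removed above (the class now lives in `deriva_ml/asset/aux_classes.py`) used a version-gated subclassing trick worth spelling out: before Python 3.12, `pathlib.Path.__new__` dispatched to a concrete flavour class, so attribute-carrying subclasses had to derive from `type(Path())` and do their work in `__new__`; from 3.12 on, `Path` subclasses normally and a plain `__init__` suffices. A minimal generic sketch of the same pattern (illustrative names, not the library's API):

```python
import sys
from pathlib import Path

if sys.version_info >= (3, 12):
    class TaggedPath(Path):  # 3.12+: Path is directly subclassable
        def __init__(self, *args, tag: str = ""):
            super().__init__(*args)
            self.tag = tag
else:
    class TaggedPath(type(Path())):  # pre-3.12: inherit the concrete flavour
        def __new__(cls, *args, tag: str = ""):
            obj = super().__new__(cls, *args)  # only path parts go to Path
            obj.tag = tag
            return obj

p = TaggedPath("/tmp/model.pt", tag="model")
print(p.name, p.tag)  # model.pt model
```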
@@ -201,14 +128,21 @@ class Execution:
         stop_time (datetime | None): When execution completed.
 
     Example:
-
-
-
-
-
-
-
-
+        The context manager handles start/stop timing. Upload must be called AFTER
+        the context manager exits::
+
+        >>> config = ExecutionConfiguration(
+        ...     workflow="analysis",
+        ...     description="Process samples",
+        ... )
+        >>> with ml.create_execution(config) as execution:
+        ...     bag = execution.download_dataset_bag(dataset_spec)
+        ...     # Run analysis using bag.path
+        ...     output_path = execution.asset_file_path("Model", "model.pt")
+        ...     # Write results to output_path
+        ...
+        >>> # IMPORTANT: Call upload AFTER exiting the context manager
+        >>> execution.upload_execution_outputs()
     """
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -216,7 +150,7 @@ class Execution:
         self,
         configuration: ExecutionConfiguration,
         ml_object: DerivaML,
-        workflow: Workflow |
+        workflow: Workflow | None = None,
         reload: RID | None = None,
         dry_run: bool = False,
     ):
@@ -228,13 +162,32 @@ class Execution:
         Args:
             configuration: Settings and parameters for the execution.
             ml_object: DerivaML instance managing the execution.
-            workflow: Optional
-
+            workflow: Optional Workflow object. If not specified, the workflow is taken from
+                the ExecutionConfiguration object. Must be a Workflow object, not a RID.
             reload: Optional RID of existing execution to reload.
             dry_run: If True, don't create catalog records or upload results.
 
         Raises:
-            DerivaMLException: If initialization fails
+            DerivaMLException: If initialization fails, configuration is invalid,
+                or workflow is not a Workflow object.
+
+        Example:
+            Create an execution with a workflow::
+
+            >>> workflow = ml.lookup_workflow("2-ABC1")
+            >>> config = ExecutionConfiguration(
+            ...     workflow=workflow,
+            ...     description="Process data"
+            ... )
+            >>> execution = Execution(config, ml)
+
+            Or pass workflow separately::
+
+            >>> workflow = ml.lookup_workflow_by_url(
+            ...     "https://github.com/org/repo/blob/abc123/analysis.py"
+            ... )
+            >>> config = ExecutionConfiguration(description="Run analysis")
+            >>> execution = Execution(config, ml, workflow=workflow)
         """
 
         self.asset_paths: dict[str, list[AssetFilePath]] = {}
@@ -244,9 +197,10 @@ class Execution:
         self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
-        self.
+        self._status = Status.created
         self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
         self.configuration.argv = sys.argv
+        self._execution_record: ExecutionRecord | None = None  # Lazily created after RID is assigned
 
         self.dataset_rids: List[RID] = []
         self.datasets: list[DatasetBag] = []
@@ -255,18 +209,24 @@ class Execution:
         self._cache_dir = self._ml_object.cache_dir
         self._dry_run = dry_run
 
-        # Make sure we have a
+        # Make sure we have a valid Workflow object.
         if workflow:
             self.configuration.workflow = workflow
-
-
-
-
+
+        if self.configuration.workflow is None:
+            raise DerivaMLException("Workflow must be specified either in configuration or as a parameter")
+
+        if not isinstance(self.configuration.workflow, Workflow):
+            raise DerivaMLException(
+                f"Workflow must be a Workflow object, not {type(self.configuration.workflow).__name__}. "
+                "Use ml.lookup_workflow(rid) or ml.lookup_workflow_by_url(url) to get a Workflow object."
             )
-
-
-
-
+
+        # Validate workflow type and register in catalog
+        self._ml_object.lookup_term(MLVocab.workflow_type, self.configuration.workflow.workflow_type)
+        self.workflow_rid = (
+            self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
+        )
 
         # Validate the datasets and assets to be valid.
         for d in self.configuration.datasets:
@@ -277,7 +237,7 @@ class Execution:
             if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
                 raise DerivaMLException("Asset specified in execution configuration is not a asset table")
 
-        schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+        schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
             if self.execution_rid == DRY_RUN_RID:
@@ -309,6 +269,18 @@ class Execution:
 
         # Create a directory for execution rid so we can recover the state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
+
+        # Create the ExecutionRecord to handle catalog state operations
+        if not self._dry_run:
+            self._execution_record = ExecutionRecord(
+                execution_rid=self.execution_rid,
+                workflow=self.configuration.workflow,
+                status=Status.created,
+                description=self.configuration.description,
+                _ml_instance=self._ml_object,
+                _logger=self._logger,
+            )
+
         self._initialize_execution(reload)
 
     def _save_runtime_environment(self):
@@ -321,31 +293,33 @@ class Execution:
             json.dump(get_execution_environment(), fp)
 
     def _upload_hydra_config_assets(self):
-        """Upload hydra assets to the catalog."""
+        """Upload hydra assets to the catalog with Hydra_Config type."""
         hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
         if hydra_runtime_output_dir:
             timestamp = hydra_runtime_output_dir.parts[-1]
             for hydra_asset in hydra_runtime_output_dir.rglob("*"):
                 if hydra_asset.is_dir():
                     continue
-
+                # Register file for upload (side effect); result intentionally unused
+                # Use Hydra_Config type for Hydra YAML configuration files
+                self.asset_file_path(
                     asset_name=MLAsset.execution_metadata,
                     file_name=hydra_runtime_output_dir / hydra_asset,
                     rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
-                    asset_types=ExecMetadataType.
+                    asset_types=ExecMetadataType.hydra_config.value,
                 )
 
     def _initialize_execution(self, reload: RID | None = None) -> None:
-        """Initialize the execution
-        Set up a working directory and download all the assets and data.
+        """Initialize the execution environment.
 
-
+        Sets up the working directory, downloads required datasets and assets,
+        and saves initial configuration metadata.
 
         Args:
-            reload: RID of previously initialized execution.
-
-        Returns:
+            reload: Optional RID of a previously initialized execution to reload.
 
+        Raises:
+            DerivaMLException: If initialization fails.
         """
         # Materialize bdbag
         for dataset in self.configuration.datasets:
@@ -354,7 +328,7 @@ class Execution:
             self.dataset_rids.append(dataset.rid)
 
         # Update execution info
-        schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
+        schema_path = self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema]
         if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
@@ -379,16 +353,21 @@ class Execution:
 
         # Save configuration details for later upload
         if not reload:
+            # Save DerivaML configuration with Deriva_Config type
            cfile = self.asset_file_path(
                 asset_name=MLAsset.execution_metadata,
                 file_name="configuration.json",
-                asset_types=ExecMetadataType.
+                asset_types=ExecMetadataType.deriva_config.value,
             )
 
             with Path(cfile).open("w", encoding="utf-8") as config_file:
                 json.dump(self.configuration.model_dump(mode="json"), config_file)
-
-            if
+            # Only try to copy uv.lock if git_root is available (local workflow)
+            if self.configuration.workflow.git_root:
+                lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
+            else:
+                lock_file = None
+            if lock_file and lock_file.exists():
                 _ = self.asset_file_path(
                     asset_name=MLAsset.execution_metadata,
                     file_name=lock_file,
@@ -405,6 +384,37 @@ class Execution:
         self.start_time = datetime.now()
         self.update_status(Status.pending, "Initialize status finished.")
 
+    @property
+    def status(self) -> Status:
+        """Get the current execution status.
+
+        Returns:
+            Status: The current status (Created, Running, Completed, Failed, etc.).
+        """
+        if self._execution_record is not None:
+            return self._execution_record.status
+        return self._status
+
+    @status.setter
+    def status(self, value: Status) -> None:
+        """Set the execution status.
+
+        Args:
+            value: The new status value.
+        """
+        self._status = value
+        if self._execution_record is not None:
+            self._execution_record._status = value
+
+    @property
+    def execution_record(self) -> ExecutionRecord | None:
+        """Get the ExecutionRecord for catalog operations.
+
+        Returns:
+            ExecutionRecord if not in dry_run mode, None otherwise.
+        """
+        return self._execution_record
+
     @property
     def working_dir(self) -> Path:
         """Return the working directory for the execution."""
@@ -412,39 +422,78 @@ class Execution:
 
     @property
     def _execution_root(self) -> Path:
+        """Get the root directory for this execution's files.
+
+        Returns:
+            Path to the execution-specific directory.
         """
+        return execution_root(self._working_dir, self.execution_rid)
 
-
+    @property
+    def _feature_root(self) -> Path:
+        """Get the root directory for feature files.
 
        Returns:
-
+            Path to the feature directory within the execution.
+        """
+        return feature_root(self._working_dir, self.execution_rid)
+
+    @property
+    def _asset_root(self) -> Path:
+        """Get the root directory for asset files.
 
+        Returns:
+            Path to the asset directory within the execution.
         """
-        return
+        return asset_root(self._working_dir, self.execution_rid)
 
     @property
-    def
-        """
-        :return:
+    def database_catalog(self) -> DerivaMLDatabase | None:
+        """Get a catalog-like interface for downloaded datasets.
 
-
+        Returns a DerivaMLDatabase that implements the DerivaMLCatalog
+        protocol, allowing the same code to work with both live catalogs
+        and downloaded bags.
+
+        This is useful for writing code that can operate on either a live
+        catalog (via DerivaML) or on downloaded bags (via DerivaMLDatabase).
 
         Returns:
+            DerivaMLDatabase wrapping the primary downloaded dataset's model,
+            or None if no datasets have been downloaded.
 
+        Example:
+            >>> with ml.create_execution(config) as exe:
+            ...     if exe.database_catalog:
+            ...         db = exe.database_catalog
+            ...         # Use same interface as DerivaML
+            ...         dataset = db.lookup_dataset("4HM")
+            ...         term = db.lookup_term("Diagnosis", "cancer")
+            ...     else:
+            ...         # No datasets downloaded, use live catalog
+            ...         pass
         """
-
+        if not self.datasets:
+            return None
+        # Use the first dataset's model as the primary
+        return DerivaMLDatabase(self.datasets[0].model)
 
     @property
-    def
-        """
-        :return:
+    def catalog(self) -> "DerivaML":
+        """Get the live catalog (DerivaML) instance for this execution.
 
-
+        This provides access to the live catalog for operations that require
+        catalog connectivity, such as looking up datasets or other read operations.
 
         Returns:
+            DerivaML: The live catalog instance.
 
+        Example:
+            >>> with ml.create_execution(config) as exe:
+            ...     # Use live catalog for lookups
+            ...     existing_dataset = exe.catalog.lookup_dataset("1-ABC")
         """
-        return
+        return self._ml_object
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
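
The `database_catalog` / `catalog` pair added above is meant to let the same analysis code run against either a downloaded bag or the live catalog. A small sketch under that assumption, using only the lookup calls the docstrings show (`lookup_dataset`, `lookup_term`):

```python
# Sketch: prefer the bag-backed catalog when datasets were downloaded,
# otherwise fall back to the live DerivaML instance.
def resolve_inputs(exe, dataset_rid: str):
    db = exe.database_catalog or exe.catalog  # None falls through to live catalog
    dataset = db.lookup_dataset(dataset_rid)
    term = db.lookup_term("Diagnosis", "cancer")
    return dataset, term
```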
@@ -471,7 +520,7 @@ class Execution:
             >>> bag = execution.download_dataset_bag(spec)
             >>> print(f"Downloaded to {bag.path}")
         """
-        return self._ml_object.download_dataset_bag(dataset
+        return self._ml_object.download_dataset_bag(dataset)
 
     @validate_call
     def update_status(self, status: Status, msg: str) -> None:
@@ -490,21 +539,26 @@ class Execution:
         Example:
             >>> execution.update_status(Status.running, "Processing sample 1 of 10")
         """
-        self.
+        self._status = status
         self._logger.info(msg)
 
         if self._dry_run:
             return
 
-
-
-
-
-
-
-
-
-
+        # Delegate to ExecutionRecord for catalog updates
+        if self._execution_record is not None:
+            self._execution_record.update_status(status, msg)
+        else:
+            # Fallback for cases where ExecutionRecord isn't available
+            self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
+                [
+                    {
+                        "RID": self.execution_rid,
+                        "Status": status.value,
+                        "Status_Detail": msg,
+                    }
+                ]
+            )
 
     def execution_start(self) -> None:
         """Marks the execution as started.
@@ -545,17 +599,23 @@ class Execution:
 
         self.update_status(Status.completed, "Algorithm execution ended.")
         if not self._dry_run:
-            self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
+            self._ml_object.pathBuilder().schemas[self._ml_object.ml_schema].Execution.update(
                 [{"RID": self.execution_rid, "Duration": duration}]
             )
 
-    def _upload_execution_dirs(
+    def _upload_execution_dirs(
+        self, progress_callback: Callable[[UploadProgress], None] | None = None
+    ) -> dict[str, list[AssetFilePath]]:
         """Upload execution assets at _working_dir/Execution_asset.
 
         This routine uploads the contents of the
         Execution_Asset directory and then updates the execution_asset table in the ML schema to have references
         to these newly uploaded files.
 
+        Args:
+            progress_callback: Optional callback function to receive upload progress updates.
+                Called with UploadProgress objects containing file information and progress.
+
         Returns:
             dict: Results of the upload operation.
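
Since `_upload_execution_dirs` now threads `progress_callback` through to `upload_directory`, callers can surface progress however they like. A hedged sketch, assuming `UploadProgress` exposes the fields the docstrings in this diff name (`file_name`, `percent_complete`):

```python
# Sketch: minimal progress reporter; field names follow this diff's docstrings.
def log_progress(progress) -> None:
    print(f"[upload] {progress.file_name}: {progress.percent_complete:.1f}%")

outputs = execution.upload_execution_outputs(progress_callback=log_progress)
```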
@@ -565,11 +625,11 @@ class Execution:
 
         try:
             self.update_status(Status.running, "Uploading execution files...")
-            results = upload_directory(self._model, self._asset_root)
-        except RuntimeError as e:
+            results = upload_directory(self._model, self._asset_root, progress_callback=progress_callback)
+        except (RuntimeError, DerivaMLException) as e:
             error = format_exception(e)
             self.update_status(Status.failed, error)
-            raise DerivaMLException(f"
+            raise DerivaMLException(f"Failed to upload execution_assets: {error}")
 
         asset_map = {}
         for path, status in results.items():
@@ -578,7 +638,7 @@ class Execution:
             asset_map.setdefault(asset_table, []).append(
                 AssetFilePath(
                     asset_path=path,
-
+                    asset_table=asset_table,
                     file_name=file_name,
                     asset_metadata={
                         k: v
@@ -629,7 +689,7 @@ class Execution:
            hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
 
        asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
-        type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
+        type_path = self._ml_object.pathBuilder().schemas[asset_type_table.schema.name].tables[asset_type_table.name]
        asset_types = [
            asset_type[MLVocab.asset_type.value]
            for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
@@ -642,7 +702,7 @@ class Execution:
            asset_rid=asset_rid,
            asset_path=asset_filename,
            asset_metadata=asset_metadata,
-
+            asset_table=asset_table.name,
            asset_types=asset_types,
        )
 
@@ -690,15 +750,26 @@ class Execution:
         results = upload_directory(self._model, assets_dir)
         return {path_to_asset(p): r for p, r in results.items()}
 
-    def upload_execution_outputs(
+    def upload_execution_outputs(
+        self, clean_folder: bool | None = None, progress_callback: Callable[[UploadProgress], None] | None = None
+    ) -> dict[str, list[AssetFilePath]]:
         """Uploads all outputs from the execution to the catalog.
 
         Scans the execution's output directories for assets, features, and other results,
         then uploads them to the catalog. Can optionally clean up the output folders
         after successful upload.
 
+        IMPORTANT: This method must be called AFTER exiting the context manager, not inside it.
+        The context manager handles execution timing (start/stop), while this method handles
+        the separate upload step.
+
         Args:
-            clean_folder: Whether to delete output folders after upload.
+            clean_folder: Whether to delete output folders after upload. If None (default),
+                uses the DerivaML instance's clean_execution_dir setting. Pass True/False
+                to override for this specific execution.
+            progress_callback: Optional callback function to receive upload progress updates.
+                Called with UploadProgress objects containing file name, bytes uploaded,
+                total bytes, percent complete, phase, and status message.
 
         Returns:
             dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.
@@ -707,14 +778,28 @@ class Execution:
             DerivaMLException: If upload fails or outputs are invalid.
 
         Example:
-            >>>
-
-            ...
+            >>> with ml.create_execution(config) as execution:
+            ...     # Do ML work, register output files with asset_file_path()
+            ...     path = execution.asset_file_path("Model", "model.pt")
+            ...     # Write to path...
+            ...
+            >>> # Upload AFTER the context manager exits
+            >>> def my_callback(progress):
+            ...     print(f"Uploading {progress.file_name}: {progress.percent_complete:.1f}%")
+            >>> outputs = execution.upload_execution_outputs(progress_callback=my_callback)
+            >>>
+            >>> # Override cleanup setting for this execution
+            >>> outputs = execution.upload_execution_outputs(clean_folder=False)  # Keep files
         """
         if self._dry_run:
             return {}
+
+        # Use DerivaML instance setting if not explicitly provided
+        if clean_folder is None:
+            clean_folder = getattr(self._ml_object, 'clean_execution_dir', True)
+
         try:
-            self.uploaded_assets = self._upload_execution_dirs()
+            self.uploaded_assets = self._upload_execution_dirs(progress_callback=progress_callback)
             self.update_status(Status.completed, "Successfully end the execution.")
             if clean_folder:
                 self._clean_folder_contents(self._execution_root)
@@ -724,14 +809,17 @@ class Execution:
             self.update_status(Status.failed, error)
             raise e
 
-    def _clean_folder_contents(self, folder_path: Path):
-        """Clean up folder contents
+    def _clean_folder_contents(self, folder_path: Path, remove_folder: bool = True):
+        """Clean up folder contents and optionally the folder itself.
+
+        Removes all files and subdirectories within the specified folder.
+        Uses retry logic for Windows compatibility where files may be temporarily locked.
 
         Args:
-            folder_path: Path to the folder to clean
+            folder_path: Path to the folder to clean.
+            remove_folder: If True (default), also remove the folder itself after
+                cleaning its contents. If False, only remove contents.
         """
-        import time
-
         MAX_RETRIES = 3
         RETRY_DELAY = 1  # seconds
 
@@ -745,20 +833,26 @@ class Execution:
                     return True
                 except (OSError, PermissionError) as e:
                     if attempt == MAX_RETRIES - 1:
-
+                        logging.warning(f"Failed to remove {path}: {e}")
                         return False
                    time.sleep(RETRY_DELAY)
            return False
 
        try:
+            # First remove all contents
            with os.scandir(folder_path) as entries:
                for entry in entries:
                    if entry.is_dir() and not entry.is_symlink():
                        remove_with_retry(Path(entry.path), is_dir=True)
                    else:
                        remove_with_retry(Path(entry.path))
+
+            # Then remove the folder itself if requested
+            if remove_folder:
+                remove_with_retry(folder_path, is_dir=True)
+
        except OSError as e:
-
+            logging.warning(f"Failed to clean folder {folder_path}: {e}")
 
     def _update_feature_table(
         self,
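
The retry loop above exists because Windows can briefly keep handles open on freshly written files, making immediate deletion fail. The same pattern as a standalone sketch (a generic helper, not the module's exact code):

```python
import logging
import shutil
import time
from pathlib import Path

def remove_with_retry(path: Path, is_dir: bool = False,
                      retries: int = 3, delay: float = 1.0) -> bool:
    """Delete a file or tree, retrying to ride out transient Windows locks."""
    for attempt in range(retries):
        try:
            shutil.rmtree(path) if is_dir else path.unlink()
            return True
        except (OSError, PermissionError) as exc:
            if attempt == retries - 1:
                logging.warning("Failed to remove %s: %s", path, exc)
                return False
            time.sleep(delay)  # give the lock holder a moment to let go
    return False
```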
@@ -767,14 +861,16 @@ class Execution:
         feature_file: str | Path,
         uploaded_files: dict[str, list[AssetFilePath]],
     ) -> None:
-        """
+        """Update the feature table with values from a JSONL file.
+
+        Reads feature values from a file and inserts them into the catalog,
+        replacing file paths with the RIDs of uploaded assets.
 
         Args:
-            target_table:
-            feature_name:
-            feature_file:
-            uploaded_files:
-                RID of that asset.
+            target_table: Name of the table the feature is defined on.
+            feature_name: Name of the feature to update.
+            feature_file: Path to JSONL file containing feature values.
+            uploaded_files: Map from asset table names to their uploaded AssetFilePath objects.
         """
 
         # Get the column names of all the Feature columns that should be the RID of an asset
@@ -804,7 +900,7 @@ class Execution:
         with Path(feature_file).open("r") as feature_values:
             entities = [json.loads(line.strip()) for line in feature_values]
             # Update the asset columns in the feature and add to the catalog.
-            self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
+            self._ml_object.domain_path().tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
 
     def _update_asset_execution_table(
         self,
@@ -824,7 +920,7 @@ class Execution:
             return
         self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
-        pb = self._ml_object.pathBuilder
+        pb = self._ml_object.pathBuilder()
         for asset_table, asset_list in uploaded_assets.items():
             asset_table_name = asset_table.split("/")[1]  # Peel off the schema from the asset table
             asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
@@ -924,6 +1020,11 @@ class Execution:
             # There is a funny bug with S3 hatrac if we have the leading _ in the filename.
             file_name = file_name.with_name("-implementations.log")
 
+        # Resolve relative paths to absolute paths to ensure exists() and symlink work correctly
+        # regardless of the current working directory
+        if not file_name.is_absolute():
+            file_name = file_name.resolve()
+
         target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
         asset_path = asset_file_path(
             prefix=self._working_dir,
@@ -949,7 +1050,7 @@ class Execution:
 
         return AssetFilePath(
             asset_path=asset_path,
-
+            asset_table=asset_name,
             file_name=target_name.name,
             asset_metadata=kwargs,
             asset_types=asset_types,
@@ -964,10 +1065,18 @@ class Execution:
         Returns:
             Pathlib path to the file in which to place table values.
         """
-
-
+        # Find which domain schema contains this table
+        table_schema = None
+        for domain_schema in self._ml_object.domain_schemas:
+            if domain_schema in self._model.schemas:
+                if table in self._model.schemas[domain_schema].tables:
+                    table_schema = domain_schema
+                    break
+
+        if table_schema is None:
+            raise DerivaMLException("Table '{}' not found in any domain schema".format(table))
 
-        return table_path(self._working_dir, schema=
+        return table_path(self._working_dir, schema=table_schema, table=table)
 
     def execute(self) -> Execution:
         """Initiate an execution with the provided configuration. Can be used in a context manager."""
@@ -1005,9 +1114,11 @@ class Execution:
             # Update feature records to include current execution_rid
             first_row = features[0]
             feature = first_row.feature
+            # Use the schema from the feature table
+            feature_schema = feature.feature_table.schema.name
             json_path = feature_value_path(
                 self._working_dir,
-                schema=
+                schema=feature_schema,
                 target_table=feature.target_table.name,
                 feature_name=feature.feature_name,
                 exec_rid=self.execution_rid,
@@ -1017,78 +1128,93 @@ class Execution:
                 feature.Execution = self.execution_rid
                 file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
 
-
-
-        self,
-        dataset_types: str | list[str],
-        description: str,
-        version: DatasetVersion | None = None,
-    ) -> RID:
-        """Create a new dataset with specified types.
-
-        Args:
-            dataset_types: param description:
-            description: Markdown description of the dataset being created.
-            version: Version to assign to the dataset. Defaults to 0.1.0
+    def list_input_datasets(self) -> list[Dataset]:
+        """List all datasets that were inputs to this execution.
 
         Returns:
-
+            List of Dataset objects that were used as inputs.
+
+        Example:
+            >>> for ds in execution.list_input_datasets():
+            ...     print(f"Input: {ds.dataset_rid} - {ds.description}")
         """
-
+        if self._execution_record is not None:
+            return self._execution_record.list_input_datasets()
 
-
-        self
-
-        members: list[RID] | dict[str, list[RID]],
-        validate: bool = True,
-        description: str = "",
-    ) -> None:
-        """Add additional elements to an existing dataset_table.
+        # Fallback for dry_run mode
+        pb = self._ml_object.pathBuilder()
+        dataset_exec = pb.schemas[self._ml_object.ml_schema].Dataset_Execution
 
-
-
+        records = list(
+            dataset_exec.filter(dataset_exec.Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        return [self._ml_object.lookup_dataset(r["Dataset"]) for r in records]
 
-
-
+    def list_assets(self, asset_role: str | None = None) -> list["Asset"]:
+        """List all assets that were inputs or outputs of this execution.
 
         Args:
-
-
-
-
-
+            asset_role: Optional filter: "Input" or "Output". If None, returns all.
+
+        Returns:
+            List of Asset objects associated with this execution.
+
+        Example:
+            >>> inputs = execution.list_assets(asset_role="Input")
+            >>> outputs = execution.list_assets(asset_role="Output")
         """
-
-
-
-
-
-
-        )
+        if self._execution_record is not None:
+            return self._execution_record.list_assets(asset_role=asset_role)
+
+        # Fallback for dry_run mode
+        from deriva_ml.asset.asset import Asset
+
+        pb = self._ml_object.pathBuilder()
+        asset_exec = pb.schemas[self._ml_object.ml_schema].Execution_Asset_Execution
+
+        query = asset_exec.filter(asset_exec.Execution == self.execution_rid)
+        if asset_role:
+            query = query.filter(asset_exec.Asset_Role == asset_role)
+
+        records = list(query.entities().fetch())
+
+        assets = []
+        for r in records:
+            try:
+                asset = self._ml_object.lookup_asset(r["Execution_Asset"])
+                assets.append(asset)
+            except Exception:
+                pass  # Skip assets that can't be looked up
+        return assets
+
+    @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+    def create_dataset(
+        self,
+        dataset_types: str | list[str] | None = None,
+        version: DatasetVersion | str | None = None,
+        description: str = "",
+    ) -> Dataset:
+        """Create a new dataset with specified types.
 
-
-        self, dataset_rid: RID, component: VersionPart, description: str = ""
-    ) -> DatasetVersion:
-        """Increment the version of the specified dataset_table.
+        Creates a dataset associated with this execution for provenance tracking.
 
         Args:
-
-
-
-            component: Major, Minor, or Patch
-            description: Description of the version update of the dataset_table.
+            dataset_types: One or more dataset type terms from Dataset_Type vocabulary.
+            description: Markdown description of the dataset being created.
+            version: Dataset version. Defaults to 0.1.0.
 
         Returns:
-
-
-        Raises:
-            DerivaMLException: if provided RID is not to a dataset_table.
+            The newly created Dataset.
         """
-        return
-
-            component=component,
-            description=description,
+        return Dataset.create_dataset(
+            ml_instance=self._ml_object,
             execution_rid=self.execution_rid,
+            dataset_types=dataset_types,
+            version=version,
+            description=description,
         )
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
@@ -1097,7 +1223,7 @@ class Execution:
         files: Iterable[FileSpec],
         dataset_types: str | list[str] | None = None,
         description: str = "",
-    ) ->
+    ) -> "Dataset":
         """Adds files to the catalog with their metadata.
 
         Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
@@ -1109,7 +1235,7 @@ class Execution:
             description: Description of the files.
 
         Returns:
-            RID: Dataset
+            RID: Dataset that identifies newly added files. Will be nested to mirror original directory structure
             of the files.
 
         Raises:
@@ -1128,11 +1254,186 @@ class Execution:
         """
         return self._ml_object.add_files(
             files=files,
-            dataset_types=dataset_types,
             execution_rid=self.execution_rid,
+            dataset_types=dataset_types,
             description=description,
         )
 
+    # =========================================================================
+    # Execution Nesting Methods
+    # =========================================================================
+
+    def add_nested_execution(
+        self,
+        nested_execution: "Execution | ExecutionRecord | RID",
+        sequence: int | None = None,
+    ) -> None:
+        """Add a nested (child) execution to this execution.
+
+        Creates a parent-child relationship between this execution and another.
+        This is useful for grouping related executions, such as parameter sweeps
+        or pipeline stages.
+
+        Args:
+            nested_execution: The child execution to add (Execution, ExecutionRecord, or RID).
+            sequence: Optional ordering index (0, 1, 2...). Use None for parallel executions.
+
+        Raises:
+            DerivaMLException: If the association cannot be created.
+
+        Example:
+            >>> parent_exec = ml.create_execution(parent_config)
+            >>> child_exec = ml.create_execution(child_config)
+            >>> parent_exec.add_nested_execution(child_exec, sequence=0)
+        """
+        if self._dry_run:
+            return
+
+        # Get the RID from the nested execution
+        if isinstance(nested_execution, Execution):
+            nested_rid = nested_execution.execution_rid
+        elif isinstance(nested_execution, ExecutionRecord):
+            nested_rid = nested_execution.execution_rid
+        else:
+            nested_rid = nested_execution
+
+        # Delegate to ExecutionRecord if available
+        if self._execution_record is not None:
+            self._execution_record.add_nested_execution(nested_rid, sequence=sequence)
+        else:
+            # Fallback for cases without execution record
+            pb = self._ml_object.pathBuilder()
+            execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+            record = {
+                "Execution": self.execution_rid,
+                "Nested_Execution": nested_rid,
+            }
+            if sequence is not None:
+                record["Sequence"] = sequence
+
+            execution_execution.insert([record])
+
+    def list_nested_executions(
+        self,
+        recurse: bool = False,
+        _visited: set[RID] | None = None,
+    ) -> list["ExecutionRecord"]:
+        """List all nested (child) executions of this execution.
+
+        Args:
+            recurse: If True, recursively return all descendant executions.
+            _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+        Returns:
+            List of nested ExecutionRecord objects, ordered by sequence if available.
+            To get full Execution objects with lifecycle management, use restore_execution().
+
+        Example:
+            >>> children = parent_exec.list_nested_executions()
+            >>> all_descendants = parent_exec.list_nested_executions(recurse=True)
+        """
+        if self._execution_record is not None:
+            return list(self._execution_record.list_nested_executions(recurse=recurse, _visited=_visited))
+
+        # Fallback for dry_run mode
+        if _visited is None:
+            _visited = set()
+
+        if self.execution_rid in _visited:
+            return []
+        _visited.add(self.execution_rid)
+
+        pb = self._ml_object.pathBuilder()
+        execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+        # Query for nested executions, ordered by sequence
+        nested = list(
+            execution_execution.filter(execution_execution.Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        # Sort by sequence (None values at the end)
+        nested.sort(key=lambda x: (x.get("Sequence") is None, x.get("Sequence")))
+
+        children = []
+        for record in nested:
+            child = self._ml_object.lookup_execution(record["Nested_Execution"])
+            children.append(child)
+            if recurse:
+                children.extend(child.list_nested_executions(recurse=True, _visited=_visited))
+
+        return children
+
+    def list_parent_executions(
+        self,
+        recurse: bool = False,
+        _visited: set[RID] | None = None,
+    ) -> list["ExecutionRecord"]:
+        """List all parent executions that contain this execution as a nested child.
+
+        Args:
+            recurse: If True, recursively return all ancestor executions.
+            _visited: Internal parameter to track visited executions and prevent infinite recursion.
+
+        Returns:
+            List of parent ExecutionRecord objects.
+            To get full Execution objects with lifecycle management, use restore_execution().
+
+        Example:
+            >>> parents = child_exec.list_parent_executions()
+            >>> all_ancestors = child_exec.list_parent_executions(recurse=True)
+        """
+        if self._execution_record is not None:
+            return list(self._execution_record.list_parent_executions(recurse=recurse, _visited=_visited))
+
+        # Fallback for dry_run mode
+        if _visited is None:
+            _visited = set()
+
+        if self.execution_rid in _visited:
+            return []
+        _visited.add(self.execution_rid)
+
+        pb = self._ml_object.pathBuilder()
+        execution_execution = pb.schemas[self._ml_object.ml_schema].Execution_Execution
+
+        parent_records = list(
+            execution_execution.filter(execution_execution.Nested_Execution == self.execution_rid)
+            .entities()
+            .fetch()
+        )
+
+        parents = []
+        for record in parent_records:
+            parent = self._ml_object.lookup_execution(record["Execution"])
+            parents.append(parent)
+            if recurse:
+                parents.extend(parent.list_parent_executions(recurse=True, _visited=_visited))
+
+        return parents
+
+    def is_nested(self) -> bool:
+        """Check if this execution is nested within another execution.
+
+        Returns:
+            True if this execution has at least one parent execution.
+        """
+        if self._execution_record is not None:
+            return self._execution_record.is_nested()
+        return len(self.list_parent_executions()) > 0
+
+    def is_parent(self) -> bool:
+        """Check if this execution has nested child executions.
+
+        Returns:
+            True if this execution has at least one nested execution.
+        """
+        if self._execution_record is not None:
+            return self._execution_record.is_parent()
+        return len(self.list_nested_executions()) > 0
+
     def __str__(self):
         items = [
             f"caching_dir: {self._cache_dir}",
|