deriva-ml 1.13.3__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +408 -416
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +52 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -372
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.13.3.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.13.3.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
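
Most of the churn in this release is a package reorganization: the flat modules of 1.13.3 (deriva_ml_base.py, deriva_definitions.py, execution_configuration.py, upload.py, ...) move into core/, dataset/, execution/, model/, and schema/ subpackages. As a rough guide to what this means for callers, here is a minimal before/after import sketch based on the import block in the execution.py diff below; only the paths that appear in that diff are confirmed, and other symbols may have moved differently:

    # deriva-ml 1.13.3 (old flat layout)
    from deriva_ml.deriva_ml_base import DerivaML
    from deriva_ml.execution_configuration import ExecutionConfiguration, Workflow

    # deriva-ml 1.14.26 (new subpackage layout)
    from deriva_ml.core.base import DerivaML
    from deriva_ml.execution.execution_configuration import ExecutionConfiguration
    from deriva_ml.execution.workflow import Workflow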
@@ -1,52 +1,78 @@
-"""
-This module defined the Execution class which is used to interact with the state of an active execution.
+"""Execution management for DerivaML.
+
+This module provides functionality for managing and tracking executions in DerivaML. An execution
+represents a computational or manual process that operates on datasets and produces outputs.
+The module includes:
+
+- Execution class: Core class for managing execution state and context
+- Asset management: Track input and output files
+- Status tracking: Monitor and update execution progress
+- Dataset handling: Download and materialize required datasets
+- Provenance tracking: Record relationships between inputs, processes, and outputs
+
+The Execution class serves as the primary interface for managing the lifecycle of a computational
+or manual process within DerivaML.
+
+Typical usage example:
+    >>> config = ExecutionConfiguration(workflow="analysis_workflow", description="Data analysis")
+    >>> with ml.create_execution(config) as execution:
+    ...     execution.download_dataset_bag(dataset_spec)
+    ...     # Run analysis
+    ...     execution.upload_execution_outputs()
 """
 
 from __future__ import annotations
 
-from collections import defaultdict
-from datetime import datetime
 import json
 import logging
 import os
-from pathlib import Path
-
-from pydantic import validate_call, ConfigDict
-import sys
 import shutil
-from typing import Iterable, Any, Optional
+import sys
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterable, List
 
 from deriva.core import format_exception
 from deriva.core.hatrac_store import HatracStore
-from .deriva_definitions import (
+from pydantic import ConfigDict, validate_call
+
+from deriva_ml.core.base import DerivaML
+from deriva_ml.core.definitions import (
+    DRY_RUN_RID,
     RID,
-    Status,
-    FileUploadState,
-    DerivaMLException,
-    MLVocab,
-    MLAsset,
-    ExecMetadataType,
     ExecAssetType,
+    ExecMetadataType,
     FileSpec,
-    DRY_RUN_RID,
+    FileUploadState,
+    MLAsset,
+    MLVocab,
+    Status,
 )
-from .deriva_ml_base import DerivaML, FeatureRecord
-from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
-from .dataset_bag import DatasetBag
-from .execution_configuration import ExecutionConfiguration, Workflow
-from .execution_environment import get_execution_environment
-from .upload import (
+from deriva_ml.core.exceptions import DerivaMLException
+from deriva_ml.dataset.aux_classes import DatasetSpec, DatasetVersion, VersionPart
+from deriva_ml.dataset.dataset_bag import DatasetBag
+from deriva_ml.dataset.upload import (
+    asset_file_path,
+    asset_root,
+    asset_type_path,
     execution_root,
     feature_root,
-    asset_root,
     feature_value_path,
     is_feature_dir,
+    normalize_asset_dir,
     table_path,
     upload_directory,
-    normalize_asset_dir,
-    asset_file_path,
-    asset_type_path,
 )
+from deriva_ml.execution.environment import get_execution_environment
+from deriva_ml.execution.execution_configuration import ExecutionConfiguration
+from deriva_ml.execution.workflow import Workflow
+from deriva_ml.feature import FeatureRecord
+
+# Keep pycharm from complaining about undefined references in docstrings.
+execution: Execution
+ml: DerivaML
+dataset_spec: DatasetSpec
 
 try:
     from icecream import ic
@@ -55,7 +81,7 @@ except ImportError:  # Graceful fallback if IceCream isn't installed.
 
 
 try:
-    from IPython.display import display, Markdown
+    from IPython.display import Markdown, display
 except ImportError:
 
     def display(s):
@@ -69,16 +95,27 @@ except ImportError:
 if sys.version_info >= (3, 12):
 
     class AssetFilePath(Path):
-        """
-        Create a new Path object that has additional information related to the use of this path as an asset.
-
-        Args:
-            asset_path: Local path to the location of the asset.
-            asset_name: The name of the asset in the catalog (e.g. the asset table name).
-            file_name: Name of the local file that contains the contents of the asset.
-            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
-            asset_types: A list of terms from the Asset_Type controlled vocabulary.
-            asset_rid: The RID of the asset if it has been uploaded into an asset table
+        """Extended Path class for managing asset files.
+
+        Represents a file path with additional metadata about its role as an asset in the catalog.
+        This class extends the standard Path class to include information about the asset's
+        catalog representation and type.
+
+        Attributes:
+            asset_name (str): Name of the asset in the catalog (e.g., asset table name).
+            file_name (str): Name of the local file containing the asset.
+            asset_metadata (dict[str, Any]): Additional columns beyond URL, Length, and checksum.
+            asset_types (list[str]): Terms from the Asset_Type controlled vocabulary.
+            asset_rid (RID | None): Resource Identifier if uploaded to an asset table.
+
+        Example:
+            >>> path = AssetFilePath(
+            ...     "/path/to/file.txt",
+            ...     asset_name="analysis_output",
+            ...     file_name="results.txt",
+            ...     asset_metadata={"version": "1.0"},
+            ...     asset_types=["text", "results"]
+            ... )
         """
 
         def __init__(
@@ -88,16 +125,23 @@ if sys.version_info >= (3, 12):
             file_name: str,
             asset_metadata: dict[str, Any],
             asset_types: list[str] | str,
-            asset_rid: Optional["RID"] = None,
+            asset_rid: RID | None = None,
         ):
+            """Initializes an AssetFilePath instance.
+
+            Args:
+                asset_path: Local path to the asset file.
+                asset_name: Name of the asset in the catalog.
+                file_name: Name of the local file.
+                asset_metadata: Additional metadata columns.
+                asset_types: One or more asset type terms.
+                asset_rid: Optional Resource Identifier if already in catalog.
+            """
             super().__init__(asset_path)
-            # These assignments happen after __new__ returns the instance
             self.asset_name = asset_name
             self.file_name = file_name
             self.asset_metadata = asset_metadata
-            self.asset_types = (
-                asset_types if isinstance(asset_types, list) else [asset_types]
-            )
+            self.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
             self.asset_rid = asset_rid
 else:
 
@@ -105,9 +149,9 @@ else:
         """
         Create a new Path object that has additional information related to the use of this path as an asset.
 
-        Args:
+        Attributes:
             asset_path: Local path to the location of the asset.
-            asset_name: The name of the asset in the catalog (e.g. the asset table name).
+            asset_name: The name of the asset in the catalog (e.g., the asset table name).
             file_name: Name of the local file that contains the contents of the asset.
             asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
             asset_types: A list of terms from the Asset_Type controlled vocabulary.
@@ -121,65 +165,76 @@ else:
             file_name: str,
             asset_metadata: dict[str, Any],
             asset_types: list[str] | str,
-            asset_rid: Optional["RID"] = None,
+            asset_rid: RID | None = None,
         ):
             # Only pass the path to the base Path class
             obj = super().__new__(cls, asset_path)
             obj.asset_name = asset_name
             obj.file_name = file_name
             obj.asset_metadata = asset_metadata
-            obj.asset_types = (
-                asset_types if isinstance(asset_types, list) else [asset_types]
-            )
+            obj.asset_types = asset_types if isinstance(asset_types, list) else [asset_types]
             obj.asset_rid = asset_rid
             return obj
 
 
 class Execution:
-    """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
-    computational, manual processes can be represented by an execution as well.
-
-    Within DerivaML, Executions are used to provide providence. Every dataset_table and data file that is generated is
-    associated with an execution, which records which program and input parameters were used to generate that data.
-
-    Execution objects are created from an ExecutionConfiguration, which provides information about what DerivaML
-    datasets will be used, what additional files (assets) are required, what code is being run (Workflow) and an
-    optional description of the Execution. Side effects of creating an execution object are:
-
-    1. An execution record is created in the catalog and the RID of that record recorded,
-    2. Any specified datasets are downloaded and materialized
-    3. Any additional required assets are downloaded.
+    """Manages the lifecycle and context of a DerivaML execution.
 
-    Once execution is complete, a method can be called to upload any data produced by the execution. In addition, the
-    Execution object provides methods for locating where to find downloaded datasets and assets, and also where to
-    place any data that may be uploaded.
+    An Execution represents a computational or manual process within DerivaML. It provides:
+    - Dataset materialization and access
+    - Asset management (inputs and outputs)
+    - Status tracking and updates
+    - Provenance recording
+    - Result upload and cataloging
 
-    Finally, the execution object can update its current state in the DerivaML catalog, allowing users to remotely
-    track the progress of their execution.
+    The class handles downloading required datasets and assets, tracking execution state,
+    and managing the upload of results. Every dataset and file generated is associated
+    with an execution record for provenance tracking.
 
     Attributes:
-        dataset_rids (list[RID]): The RIDs of the datasets to be downloaded and materialized as part of the execution.
-        datasets (list[DatasetBag]): List of datasetBag objects that referred the materialized datasets specified in.
-            `dataset_rids`.
-        configuration (ExecutionConfiguration): The configuration of the execution.
-        workflow_rid (RID): The RID of the workflow associated with the execution.
-        status (Status): The status of the execution.
+        dataset_rids (list[RID]): RIDs of datasets used in the execution.
+        datasets (list[DatasetBag]): Materialized dataset objects.
+        configuration (ExecutionConfiguration): Execution settings and parameters.
+        workflow_rid (RID): RID of the associated workflow.
+        status (Status): Current execution status.
+        asset_paths (list[AssetFilePath]): Paths to execution assets.
+        parameters (dict): Execution parameters.
+        start_time (datetime | None): When execution started.
+        stop_time (datetime | None): When execution completed.
+
+    Example:
+        >>> config = ExecutionConfiguration(
+        ...     workflow="analysis",
+        ...     description="Process samples",
+        ...     parameters={"threshold": 0.5}
+        ... )
+        >>> with ml.create_execution(config) as execution:
+        ...     execution.download_dataset_bag(dataset_spec)
+        ...     # Run analysis
+        ...     execution.upload_execution_outputs()
     """
 
     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def __init__(
         self,
         configuration: ExecutionConfiguration,
-        ml_object: "DerivaML",
-        reload: Optional[RID] = None,
+        ml_object: DerivaML,
+        reload: RID | None = None,
         dry_run: bool = False,
     ):
-        """
+        """Initializes an Execution instance.
+
+        Creates a new execution or reloads an existing one. Initializes the execution
+        environment, downloads required datasets, and sets up asset tracking.
 
         Args:
-            configuration: Execution configuration object that describes the execution.
-            ml_object: The DerivaML instance that created the execution.
-            reload: RID of a previously initialized execution object.
+            configuration: Settings and parameters for the execution.
+            ml_object: DerivaML instance managing the execution.
+            reload: Optional RID of existing execution to reload.
+            dry_run: If True, don't create catalog records or upload results.
+
+        Raises:
+            DerivaMLException: If initialization fails or configuration is invalid.
         """
         self.asset_paths: list[AssetFilePath] = []
         self.configuration = configuration
@@ -189,10 +244,10 @@ class Execution:
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
-        self.uploaded_assets: Optional[dict[str, list[AssetFilePath]]] = None
+        self.uploaded_assets: dict[str, list[AssetFilePath]] | None = None
         self.configuration.argv = sys.argv
 
-        self.dataset_rids: list[RID] = []
+        self.dataset_rids: List[RID] = []
         self.datasets: list[DatasetBag] = []
         self.parameters = self.configuration.parameters
 
@@ -203,32 +258,21 @@ class Execution:
         # Make sure we have a good workflow.
         if isinstance(self.configuration.workflow, Workflow):
             self.workflow_rid = (
-                self._ml_object.add_workflow(self.configuration.workflow)
-                if not self._dry_run
-                else DRY_RUN_RID
+                self._ml_object.add_workflow(self.configuration.workflow) if not self._dry_run else DRY_RUN_RID
             )
         else:
             self.workflow_rid = self.configuration.workflow
-            if (
-                self._ml_object.resolve_rid(configuration.workflow).table.name
-                != "Workflow"
-            ):
-                raise DerivaMLException(
-                    "Workflow specified in execution configuration is not a Workflow"
-                )
+            if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
+                raise DerivaMLException("Workflow specified in execution configuration is not a Workflow")
 
         # Validate the datasets and assets to be valid.
         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
-                raise DerivaMLException(
-                    "Dataset specified in execution configuration is not a dataset"
-                )
+                raise DerivaMLException("Dataset specified in execution configuration is not a dataset")
 
         for a in self.configuration.assets:
             if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
-                raise DerivaMLException(
-                    "Asset specified in execution configuration is not a asset table"
-                )
+                raise DerivaMLException("Asset specified in execution configuration is not a asset table")
 
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
@@ -247,16 +291,11 @@
                 ]
             )[0]["RID"]
 
-        if (
-            isinstance(self.configuration.workflow, Workflow)
-            and self.configuration.workflow.is_notebook
-        ):
-            # Put execution_rid into cell output so we can find it later.
-            display(
-                Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}")
-            )
+        if isinstance(self.configuration.workflow, Workflow) and self.configuration.workflow.is_notebook:
+            # Put execution_rid into the cell output so we can find it later.
+            display(Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}"))
 
-        # Create a directory for execution rid so we can recover state in case of a crash.
+        # Create a directory for execution rid so we can recover the state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
         self._initialize_execution(reload)
 
@@ -266,12 +305,12 @@
             f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
             ExecMetadataType.runtime_env.value,
         )
-        with open(runtime_env_path, "w") as fp:
+        with Path(runtime_env_path).open("w") as fp:
             json.dump(get_execution_environment(), fp)
 
-    def _initialize_execution(self, reload: Optional[RID] = None) -> None:
-        """Initialize the execution by a configuration in the Execution_Metadata table.
-        Setup working directory and download all the assets and data.
+    def _initialize_execution(self, reload: RID | None = None) -> None:
+        """Initialize the execution by a configuration in the Execution_Metadata table.
+        Set up a working directory and download all the assets and data.
 
         :raise DerivaMLException: If there is an issue initializing the execution.
 
@@ -283,9 +322,7 @@
         """
         # Materialize bdbag
         for dataset in self.configuration.datasets:
-            self.update_status(
-                Status.initializing, f"Materialize bag {dataset.rid}... "
-            )
+            self.update_status(Status.initializing, f"Materialize bag {dataset.rid}... ")
             self.datasets.append(self.download_dataset_bag(dataset))
             self.dataset_rids.append(dataset.rid)
 
@@ -293,10 +330,7 @@
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
-                [
-                    {"Dataset": d, "Execution": self.execution_rid}
-                    for d in self.dataset_rids
-                ]
+                [{"Dataset": d, "Execution": self.execution_rid} for d in self.dataset_rids]
             )
 
         # Download assets....
@@ -305,9 +339,7 @@
         for asset_rid in self.configuration.assets:
             asset_table = self._ml_object.resolve_rid(asset_rid).table.name
             dest_dir = (
-                execution_root(self._ml_object.working_dir, self.execution_rid)
-                / "downloaded-assets"
-                / asset_table
+                execution_root(self._ml_object.working_dir, self.execution_rid) / "downloaded-assets" / asset_table
             )
             dest_dir.mkdir(parents=True, exist_ok=True)
             self.asset_paths.setdefault(asset_table, []).append(
325
357
  "configuration.json",
326
358
  ExecMetadataType.execution_config.value,
327
359
  )
328
- with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
360
+ with Path(cfile).open("w", encoding="utf-8") as config_file:
329
361
  json.dump(self.configuration.model_dump(), config_file)
330
362
 
331
363
  for parameter_file in self.configuration.parameters:
@@ -355,7 +387,7 @@ class Execution:
355
387
 
356
388
  @property
357
389
  def _feature_root(self) -> Path:
358
- """The root path to all execution specific files.
390
+ """The root path to all execution-specific files.
359
391
  :return:
360
392
 
361
393
  Args:
@@ -367,7 +399,7 @@ class Execution:
367
399
 
368
400
  @property
369
401
  def _asset_root(self) -> Path:
370
- """The root path to all execution specific files.
402
+ """The root path to all execution-specific files.
371
403
  :return:
372
404
 
373
405
  Args:
@@ -379,26 +411,47 @@ class Execution:
379
411
 
380
412
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
381
413
  def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
382
- """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
383
- that all the metadata is correct
414
+ """Downloads and materializes a dataset for use in the execution.
415
+
416
+ Downloads the specified dataset as a BDBag and materializes it in the execution's
417
+ working directory. The dataset version is determined by the DatasetSpec.
384
418
 
385
419
  Args:
386
- dataset: A dataset specification of a dataset_table or a minid to an existing bag.
420
+ dataset: Specification of the dataset to download, including version and
421
+ materialization options.
387
422
 
388
423
  Returns:
389
- the location of the unpacked and validated dataset_table bag and the RID of the bag
424
+ DatasetBag: Object containing:
425
+ - path: Local filesystem path to downloaded dataset
426
+ - rid: Dataset's Resource Identifier
427
+ - minid: Dataset's Minimal Viable Identifier
428
+
429
+ Raises:
430
+ DerivaMLException: If download or materialization fails.
431
+
432
+ Example:
433
+ >>> spec = DatasetSpec(rid="1-abc123", version="1.2.0")
434
+ >>> bag = execution.download_dataset_bag(spec)
435
+ >>> print(f"Downloaded to {bag.path}")
390
436
  """
391
- return self._ml_object.download_dataset_bag(
392
- dataset, execution_rid=self.execution_rid
393
- )
437
+ return self._ml_object.download_dataset_bag(dataset, execution_rid=self.execution_rid)
394
438
 
395
439
  @validate_call
396
440
  def update_status(self, status: Status, msg: str) -> None:
397
- """Update the status information in the execution record in the DerivaML catalog.
441
+ """Updates the execution's status in the catalog.
442
+
443
+ Records a new status and associated message in the catalog, allowing remote
444
+ tracking of execution progress.
398
445
 
399
446
  Args:
400
- status: A value from the Status Enum
401
- msg: Additional information about the status
447
+ status: New status value (e.g., running, completed, failed).
448
+ msg: Description of the status change or current state.
449
+
450
+ Raises:
451
+ DerivaMLException: If status update fails.
452
+
453
+ Example:
454
+ >>> execution.update_status(Status.running, "Processing sample 1 of 10")
402
455
  """
403
456
  self.status = status
404
457
  self._logger.info(msg)
@@ -417,14 +470,36 @@ class Execution:
417
470
  )
418
471
 
419
472
  def execution_start(self) -> None:
420
- """Start an execution, uploading status to catalog"""
421
-
473
+ """Marks the execution as started.
474
+
475
+ Records the start time and updates the execution's status to 'running'.
476
+ This should be called before beginning the main execution work.
477
+
478
+ Example:
479
+ >>> execution.execution_start()
480
+ >>> try:
481
+ ... # Run analysis
482
+ ... execution.execution_stop()
483
+ ... except Exception:
484
+ ... execution.update_status(Status.failed, "Analysis error")
485
+ """
422
486
  self.start_time = datetime.now()
423
487
  self.uploaded_assets = None
424
488
  self.update_status(Status.initializing, "Start execution ...")
425
489
 
426
490
  def execution_stop(self) -> None:
427
- """Finish the execution and update the duration and status of execution."""
491
+ """Marks the execution as completed.
492
+
493
+ Records the stop time and updates the execution's status to 'completed'.
494
+ This should be called after all execution work is finished.
495
+
496
+ Example:
497
+ >>> try:
498
+ ... # Run analysis
499
+ ... execution.execution_stop()
500
+ ... except Exception:
501
+ ... execution.update_status(Status.failed, "Analysis error")
502
+ """
428
503
  self.stop_time = datetime.now()
429
504
  duration = self.stop_time - self.start_time
430
505
  hours, remainder = divmod(duration.total_seconds(), 3600)
@@ -433,22 +508,22 @@ class Execution:
433
508
 
434
509
  self.update_status(Status.completed, "Algorithm execution ended.")
435
510
  if not self._dry_run:
436
- self._ml_object.pathBuilder.schemas[
437
- self._ml_object.ml_schema
438
- ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
511
+ self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
512
+ [{"RID": self.execution_rid, "Duration": duration}]
513
+ )
439
514
 
440
515
  def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
441
516
  """Upload execution assets at _working_dir/Execution_asset.
442
517
 
443
518
  This routine uploads the contents of the
444
- Execution_Asset directory, and then updates the execution_asset table in the ML schema to have references
519
+ Execution_Asset directory and then updates the execution_asset table in the ML schema to have references
445
520
  to these newly uploaded files.
446
521
 
447
522
  Returns:
448
523
  dict: Results of the upload operation.
449
524
 
450
525
  Raises:
451
- DerivaMLException: If there is an issue uploading the assets.
526
+ DerivaMLException: If there is an issue when uploading the assets.
452
527
  """
453
528
 
454
529
  try:
@@ -494,9 +569,7 @@ class Execution:
494
569
  return asset_map
495
570
 
496
571
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
497
- def download_asset(
498
- self, asset_rid: RID, dest_dir: Path, update_catalog=True
499
- ) -> AssetFilePath:
572
+ def download_asset(self, asset_rid: RID, dest_dir: Path, update_catalog=True) -> AssetFilePath:
500
573
  """Download an asset from a URL and place it in a local directory.
501
574
 
502
575
  Args:
@@ -513,25 +586,17 @@ class Execution:
513
586
  raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
514
587
 
515
588
  asset_record = self._ml_object.retrieve_rid(asset_rid)
516
- asset_metadata = {
517
- k: v
518
- for k, v in asset_record.items()
519
- if k in self._model.asset_metadata(asset_table)
520
- }
589
+ asset_metadata = {k: v for k, v in asset_record.items() if k in self._model.asset_metadata(asset_table)}
521
590
  asset_url = asset_record["URL"]
522
591
  asset_filename = dest_dir / asset_record["Filename"]
523
592
  hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
524
593
  hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
525
594
 
526
- asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
527
- type_path = self._ml_object.pathBuilder.schemas[
528
- asset_type_table.schema.name
529
- ].tables[asset_type_table.name]
595
+ asset_type_table, _col_l, _col_r = self._model.find_association(asset_table, MLVocab.asset_type)
596
+ type_path = self._ml_object.pathBuilder.schemas[asset_type_table.schema.name].tables[asset_type_table.name]
530
597
  asset_types = [
531
598
  asset_type[MLVocab.asset_type.value]
532
- for asset_type in type_path.filter(
533
- type_path.columns[asset_table.name] == asset_rid
534
- )
599
+ for asset_type in type_path.filter(type_path.columns[asset_table.name] == asset_rid)
535
600
  .attributes(type_path.Asset_Type)
536
601
  .fetch()
537
602
  ]
@@ -557,47 +622,58 @@ class Execution:
557
622
  self,
558
623
  assets_dir: str | Path,
559
624
  ) -> dict[Any, FileUploadState] | None:
560
- """Upload assets from a directory.
625
+ """Uploads assets from a directory to the catalog.
561
626
 
562
- This routine assumes that the current upload specification includes a configuration for the specified directory.
563
- Every asset in the specified directory is uploaded
627
+ Scans the specified directory for assets and uploads them to the catalog,
628
+ recording their metadata and types. Assets are organized by their types
629
+ and associated with the execution.
564
630
 
565
631
  Args:
566
- assets_dir: Directory containing the assets to upload.
632
+ assets_dir: Directory containing assets to upload.
567
633
 
568
634
  Returns:
569
- Results of the upload operation.
635
+ dict[Any, FileUploadState] | None: Mapping of assets to their upload states,
636
+ or None if no assets were found.
570
637
 
571
638
  Raises:
572
- DerivaMLException: If there is an issue uploading the assets.
639
+ DerivaMLException: If upload fails or assets are invalid.
640
+
641
+ Example:
642
+ >>> states = execution.upload_assets("output/results")
643
+ >>> for asset, state in states.items():
644
+ ... print(f"{asset}: {state}")
573
645
  """
574
646
 
575
647
  def path_to_asset(path: str) -> str:
576
648
  """Pull the asset name out of a path to that asset in the filesystem"""
577
649
  components = path.split("/")
578
- return components[
579
- components.index("asset") + 2
580
- ] # Look for asset in the path to find the name
650
+ return components[components.index("asset") + 2] # Look for asset in the path to find the name
581
651
 
582
652
  if not self._model.is_asset(Path(assets_dir).name):
583
653
  raise DerivaMLException("Directory does not have name of an asset table.")
584
654
  results = upload_directory(self._model, assets_dir)
585
655
  return {path_to_asset(p): r for p, r in results.items()}
586
656
 
587
- def upload_execution_outputs(
588
- self, clean_folder: bool = True
589
- ) -> dict[str, list[AssetFilePath]]:
590
- """Upload all the assets and metadata associated with the current execution.
657
+ def upload_execution_outputs(self, clean_folder: bool = True) -> dict[str, list[AssetFilePath]]:
658
+ """Uploads all outputs from the execution to the catalog.
591
659
 
592
- This will include any new assets, features, or table values.
660
+ Scans the execution's output directories for assets, features, and other results,
661
+ then uploads them to the catalog. Can optionally clean up the output folders
662
+ after successful upload.
593
663
 
594
664
  Args:
595
- clean_folder: bool: (Default value = True)
665
+ clean_folder: Whether to delete output folders after upload. Defaults to True.
596
666
 
597
667
  Returns:
598
- Results of the upload operation. Asset names are all relative to the execution upload directory.
599
- Uploaded assets with key as assets' suborder name, values as an
600
- ordered dictionary with RID and metadata in the Execution_Asset table.
668
+ dict[str, list[AssetFilePath]]: Mapping of asset types to their file paths.
669
+
670
+ Raises:
671
+ DerivaMLException: If upload fails or outputs are invalid.
672
+
673
+ Example:
674
+ >>> outputs = execution.upload_execution_outputs()
675
+ >>> for type_name, paths in outputs.items():
676
+ ... print(f"{type_name}: {len(paths)} files")
601
677
  """
602
678
  if self._dry_run:
603
679
  return {}
@@ -613,21 +689,40 @@ class Execution:
613
689
  raise e
614
690
 
615
691
  def _clean_folder_contents(self, folder_path: Path):
616
- """
692
+ """Clean up folder contents with Windows-compatible error handling.
617
693
 
618
694
  Args:
619
- folder_path: Path:
695
+ folder_path: Path to the folder to clean
620
696
  """
697
+ import time
698
+
699
+ MAX_RETRIES = 3
700
+ RETRY_DELAY = 1 # seconds
701
+
702
+ def remove_with_retry(path: Path, is_dir: bool = False) -> bool:
703
+ for attempt in range(MAX_RETRIES):
704
+ try:
705
+ if is_dir:
706
+ shutil.rmtree(path)
707
+ else:
708
+ Path(path).unlink()
709
+ return True
710
+ except (OSError, PermissionError) as e:
711
+ if attempt == MAX_RETRIES - 1:
712
+ self.update_status(Status.failed, format_exception(e))
713
+ return False
714
+ time.sleep(RETRY_DELAY)
715
+ return False
716
+
621
717
  try:
622
718
  with os.scandir(folder_path) as entries:
623
719
  for entry in entries:
624
720
  if entry.is_dir() and not entry.is_symlink():
625
- shutil.rmtree(entry.path)
721
+ remove_with_retry(Path(entry.path), is_dir=True)
626
722
  else:
627
- os.remove(entry.path)
723
+ remove_with_retry(Path(entry.path))
628
724
  except OSError as e:
629
- error = format_exception(e)
630
- self.update_status(Status.failed, error)
725
+ self.update_status(Status.failed, format_exception(e))
631
726
 
632
727
  def _update_feature_table(
633
728
  self,
@@ -642,28 +737,21 @@ class Execution:
642
737
  target_table: str:
643
738
  feature_name: str:
644
739
  feature_file: str | Path:
645
- uploaded_files: Dictionary whose key ia an asset name, file-name pair, and whose value is a filename, RID of that asset.
740
+ uploaded_files: Dictionary whose key is an asset name, file-name pair, and whose value is a filename,
741
+ RID of that asset.
646
742
  """
647
743
 
648
744
  # Get the column names of all the Feature columns that should be the RID of an asset
649
745
  asset_columns = [
650
- c.name
651
- for c in self._ml_object.feature_record_class(
652
- target_table, feature_name
653
- ).feature.asset_columns
746
+ c.name for c in self._ml_object.feature_record_class(target_table, feature_name).feature.asset_columns
654
747
  ]
655
748
 
656
749
  # Get the names of the columns in the feature that are assets.
657
750
  asset_columns = [
658
- c.name
659
- for c in self._ml_object.feature_record_class(
660
- target_table, feature_name
661
- ).feature.asset_columns
751
+ c.name for c in self._ml_object.feature_record_class(target_table, feature_name).feature.asset_columns
662
752
  ]
663
753
 
664
- feature_table = self._ml_object.feature_record_class(
665
- target_table, feature_name
666
- ).feature.feature_table.name
754
+ feature_table = self._ml_object.feature_record_class(target_table, feature_name).feature.feature_table.name
667
755
  asset_map = {
668
756
  (asset_table, asset.file_name): asset.asset_rid
669
757
  for asset_table, assets in uploaded_files.items()
@@ -677,41 +765,37 @@ class Execution:
677
765
  return e
678
766
 
679
767
  # Load the JSON file that has the set of records that contain the feature values.
680
- with open(feature_file, "r") as feature_values:
768
+ with Path(feature_file).open("r") as feature_values:
681
769
  entities = [json.loads(line.strip()) for line in feature_values]
682
770
  # Update the asset columns in the feature and add to the catalog.
683
- self._ml_object.domain_path.tables[feature_table].insert(
684
- [map_path(e) for e in entities], on_conflict_skip=True
685
- )
771
+ self._ml_object.domain_path.tables[feature_table].insert([map_path(e) for e in entities], on_conflict_skip=True)
686
772
 
687
773
  def _update_asset_execution_table(
688
774
  self,
689
775
  uploaded_assets: dict[str, list[AssetFilePath]],
690
776
  asset_role: str = "Output",
691
777
  ):
692
- """Add entry to association table connecting an asset to an execution RID
778
+ """Add entry to the association table connecting an asset to an execution RID
693
779
 
694
780
  Args:
695
- uploaded_assets: Dictionary whose key is the name of an asset table, and whose value is a list of RIDs for
781
+ uploaded_assets: Dictionary whose key is the name of an asset table and whose value is a list of RIDs for
696
782
  newly added assets to that table.
697
783
  asset_role: A term or list of terms from the Asset_Role vocabulary.
698
784
  """
699
- # Make sure the asset role is in the controlled vocabulary table.
785
+ # Make sure the asset role is in the controlled vocabulary table.
700
786
  self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
701
787
 
702
788
  pb = self._ml_object.pathBuilder
703
789
  for asset_table, asset_list in uploaded_assets.items():
704
- asset_table_name = asset_table.split("/")[
705
- 1
706
- ] # Peel off the schema from the asset table
707
- asset_exe = self._model.find_association(asset_table_name, "Execution")
790
+ asset_table_name = asset_table.split("/")[1] # Peel off the schema from the asset table
791
+ asset_exe, asset_fk, execution_fk = self._model.find_association(asset_table_name, "Execution")
708
792
  asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
709
793
 
710
794
  asset_exe_path.insert(
711
795
  [
712
796
  {
713
- asset_table_name: asset_path.asset_rid,
714
- "Execution": self.execution_rid,
797
+ asset_fk: asset_path.asset_rid,
798
+ execution_fk: self.execution_rid,
715
799
  "Asset_Role": asset_role,
716
800
  }
717
801
  for asset_path in asset_list
@@ -724,25 +808,20 @@ class Execution:
724
808
  if asset_role == "Input":
725
809
  return
726
810
  asset_type_map = {}
727
- with open(
811
+ with Path(
728
812
  asset_type_path(
729
813
  self._working_dir,
730
814
  self.execution_rid,
731
815
  self._model.name_to_table(asset_table_name),
732
- ),
733
- "r",
734
- ) as f:
735
- for line in f:
816
+ )
817
+ ).open("r") as asset_type_file:
818
+ for line in asset_type_file:
736
819
  asset_type_map.update(json.loads(line.strip()))
737
820
  for asset_path in asset_list:
738
821
  asset_path.asset_types = asset_type_map[asset_path.file_name]
739
822
 
740
- asset_asset_type = self._model.find_association(
741
- asset_table_name, "Asset_Type"
742
- )
743
- type_path = pb.schemas[asset_asset_type.schema.name].tables[
744
- asset_asset_type.name
745
- ]
823
+ asset_asset_type, _, _ = self._model.find_association(asset_table_name, "Asset_Type")
824
+ type_path = pb.schemas[asset_asset_type.schema.name].tables[asset_asset_type.name]
746
825
 
747
826
  type_path.insert(
748
827
  [
@@ -758,13 +837,13 @@ class Execution:
758
837
  self,
759
838
  asset_name: str,
760
839
  file_name: str | Path,
761
- asset_types: Optional[list[str] | str] = None,
840
+ asset_types: list[str] | str | None = None,
762
841
  copy_file=False,
763
842
  **kwargs,
764
843
  ) -> AssetFilePath:
765
844
  """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
766
845
 
767
- Given the name of an asset table, and a file name, register the file for upload, and return a path to that
846
+ Given the name of an asset table, and a file name, register the file for upload and return a path to that
768
847
  file in the upload directory. In addition to the filename, additional asset metadata and file asset types may
769
848
  be specified.
770
849
 
@@ -772,13 +851,13 @@
         to a new file with the specified name is returned. The caller can then open that file for writing.
 
         If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
-        returned path contains a symbolic link to that file. If the copy_file argument is True then the contents of
+        returned path contains a symbolic link to that file. If the copy_file argument is True, then the contents of
         file_name are copied into the target directory.
 
         Args:
             asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
             file_name: Name of file to be uploaded.
-            asset_types: Type of asset to be uploaded. Defaults to name of the asset.
+            asset_types: Type of asset to be uploaded. Defaults to the name of the asset.
             **kwargs: Any additional metadata values that may be part of the asset table.
 
         Returns:
810
889
  if copy_file:
811
890
  asset_path.write_bytes(file_name.read_bytes())
812
891
  else:
813
- asset_path.symlink_to(file_name)
892
+ try:
893
+ asset_path.symlink_to(file_name)
894
+ except (OSError, PermissionError):
895
+ # Fallback to copy if symlink fails (common on Windows)
896
+ asset_path.write_bytes(file_name.read_bytes())
814
897
 
815
898
  # Persist the asset types into a file
816
- with open(
817
- asset_type_path(self._working_dir, self.execution_rid, asset_table),
818
- "a",
819
- encoding="utf-8",
820
- ) as f:
821
- f.write(json.dumps({file_name.name: asset_types}) + "\n")
899
+ with Path(
900
+ asset_type_path(self._working_dir, self.execution_rid, asset_table)
901
+ ).open("a") as asset_type_file:
902
+ asset_type_file.write(json.dumps({file_name.name: asset_types}) + "\n")
822
903
 
823
904
  return AssetFilePath(
824
905
  asset_path=asset_path,
@@ -838,26 +919,33 @@ class Execution:
838
919
  Pathlib path to the file in which to place table values.
839
920
  """
840
921
  if table not in self._model.schemas[self._ml_object.domain_schema].tables:
841
- raise DerivaMLException(
842
- "Table '{}' not found in domain schema".format(table)
843
- )
922
+ raise DerivaMLException("Table '{}' not found in domain schema".format(table))
844
923
 
845
- return table_path(
846
- self._working_dir, schema=self._ml_object.domain_schema, table=table
847
- )
924
+ return table_path(self._working_dir, schema=self._ml_object.domain_schema, table=table)
848
925
 
849
926
  def execute(self) -> Execution:
850
- """Initiate an execution with provided configuration. Can be used in a context manager."""
927
+ """Initiate an execution with the provided configuration. Can be used in a context manager."""
851
928
  self.execution_start()
852
929
  return self
853
930
 
854
931
  @validate_call
855
932
  def add_features(self, features: Iterable[FeatureRecord]) -> None:
856
- """Given a collection of Feature records, write out a CSV file in the appropriate assets directory so that this
857
- feature gets uploaded when the execution is complete.
933
+ """Adds feature records to the catalog.
934
+
935
+ Associates feature records with this execution and uploads them to the catalog.
936
+ Features represent measurable properties or characteristics of records.
937
+
938
+ NOTE: The catalog is not updated until upload_execution_outputs() is called.
858
939
 
859
940
  Args:
860
- features: Iterable of Feature records to write.
941
+ features: Feature records to add, each containing a value and metadata.
942
+
943
+ Raises:
944
+ DerivaMLException: If feature addition fails or features are invalid.
945
+
946
+ Example:
947
+ >>> feature = FeatureRecord(value="high", confidence=0.95)
948
+ >>> execution.add_features([feature])
861
949
  """
862
950
 
863
951
  # Make sure feature list is homogeneous:
@@ -878,7 +966,7 @@ class Execution:
878
966
  feature_name=feature.feature_name,
879
967
  exec_rid=self.execution_rid,
880
968
  )
881
- with open(json_path, "a", encoding="utf-8") as file:
969
+ with Path(json_path).open("a", encoding="utf-8") as file:
882
970
  for feature in features:
883
971
  feature.Execution = self.execution_rid
884
972
  file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
@@ -888,7 +976,7 @@
         self,
         dataset_types: str | list[str],
         description: str,
-        version: Optional[DatasetVersion] = None,
+        version: DatasetVersion | None = None,
     ) -> RID:
         """Create a new dataset with specified types.
 
@@ -900,14 +988,12 @@
         Returns:
             RID of the newly created dataset.
         """
-        return self._ml_object.create_dataset(
-            dataset_types, description, self.execution_rid, version=version
-        )
+        return self._ml_object.create_dataset(dataset_types, description, self.execution_rid, version=version)
 
     def add_dataset_members(
         self,
         dataset_rid: RID,
-        members: list[RID],
+        members: list[RID] | dict[str, list[RID]],
         validate: bool = True,
         description: str = "",
     ) -> None:
@@ -920,7 +1006,7 @@
         been configured to be a dataset element type.
 
         Args:
-            dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
+            dataset_rid: RID of dataset_table to extend or None if a new dataset_table is to be created.
             members: List of RIDs of members to add to the dataset_table. RID must be to a table type that is a
                 dataset element type (see DerivaML.add_dataset_element_type).
             validate: Check rid_list to make sure elements are not already in the dataset_table.
@@ -943,7 +1029,7 @@
             dataset_rid: RID to a dataset_table
             component: Which version of the dataset_table to increment.
             dataset_rid: RID of the dataset whose version is to be incremented.
-            component: Major, Minor or Patch
+            component: Major, Minor, or Patch
             description: Description of the version update of the dataset_table.
 
         Returns:
@@ -963,13 +1049,42 @@
     def add_files(
         self,
         files: Iterable[FileSpec],
-        file_types: str | list[str],
-    ) -> Iterable[RID]:
-        """Add files to the file table"""
+        dataset_types: str | list[str] | None = None,
+        description: str = "",
+    ) -> RID:
+        """Adds files to the catalog with their metadata.
+
+        Registers files in the catalog along with their metadata (MD5, length, URL) and associates them with
+        specified file types.
+
+        Args:
+            files: File specifications containing MD5 checksum, length, and URL.
+            dataset_types: One or more dataset type terms from File_Type vocabulary.
+            description: Description of the files.
+
+        Returns:
+            RID: Dataset RID that identifies newly added files. Will be nested to mirror the original
+                directory structure of the files.
+
+        Raises:
+            DerivaMLInvalidTerm: If dataset_types are invalid or execution_rid is not an execution record.
+
+        Examples:
+            Add a single dataset type:
+            >>> files = [FileSpec(url="path/to/file.txt", md5="abc123", length=1000)]
+            >>> rid = exe.add_files(files, dataset_types="text")
+
+            Add multiple dataset types:
+            >>> rid = exe.add_files(
+            ...     files=[FileSpec(url="image.png", md5="def456", length=2000)],
+            ...     dataset_types=["image", "png"],
+            ... )
+        """
         return self._ml_object.add_files(
             files=files,
-            file_types=file_types,
+            dataset_types=dataset_types,
             execution_rid=self.execution_rid,
+            description=description,
         )
 
     def __str__(self):
@@ -1015,7 +1130,5 @@
                 Status.failed,
                 f"Exception type: {exc_type}, Exception value: {exc_value}",
             )
-            logging.error(
-                f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}"
-            )
+            logging.error(f"Exception type: {exc_type}, Exception value: {exc_value}, Exception traceback: {exc_tb}")
         return False
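
Taken together, the new docstrings describe the intended lifecycle: build an ExecutionConfiguration, enter the execution as a context manager, download dataset bags, do the work, and upload outputs. A minimal end-to-end sketch assembled from the docstring examples in this diff; the workflow name, dataset RID, version, and the DerivaML constructor arguments are illustrative placeholders, not values confirmed by this diff:

    from deriva_ml.core.base import DerivaML
    from deriva_ml.dataset.aux_classes import DatasetSpec
    from deriva_ml.execution.execution_configuration import ExecutionConfiguration

    ml = DerivaML(...)  # hypothetical: connection arguments omitted
    config = ExecutionConfiguration(workflow="analysis_workflow", description="Data analysis")
    spec = DatasetSpec(rid="1-abc123", version="1.2.0")

    with ml.create_execution(config) as execution:
        bag = execution.download_dataset_bag(spec)
        # Run the analysis against bag.path here.
        execution.upload_execution_outputs()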