deriva-ml 1.10.1__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/execution.py CHANGED
@@ -5,45 +5,37 @@ This module defined the Execution class which is used to interact with the state
5
5
  from __future__ import annotations
6
6
 
7
7
  from collections import defaultdict
8
- import csv
9
8
  import json
10
9
  import logging
11
10
  import os
12
11
  import shutil
13
12
  from datetime import datetime
14
13
  from pathlib import Path
15
- import requests
16
- from tempfile import NamedTemporaryFile
17
14
  from typing import Iterable, Any, Optional
15
+
18
16
  from deriva.core import format_exception
19
- from deriva.core.ermrest_model import Table
20
17
  from pydantic import validate_call, ConfigDict
21
18
  import sys
19
+ from deriva.core.hatrac_store import HatracStore
22
20
 
23
- from .deriva_definitions import MLVocab, ExecMetadataVocab
24
- from .deriva_definitions import (
25
- RID,
26
- Status,
27
- FileUploadState,
28
- UploadState,
29
- DerivaMLException,
30
- )
21
+ from .deriva_definitions import ExecMetadataVocab
22
+ from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
31
23
  from .deriva_ml_base import DerivaML, FeatureRecord
32
24
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
33
25
  from .dataset_bag import DatasetBag
34
26
  from .execution_configuration import ExecutionConfiguration, Workflow
35
27
  from .execution_environment import get_execution_environment
36
28
  from .upload import (
37
- execution_metadata_dir,
38
- execution_asset_dir,
39
29
  execution_root,
40
30
  feature_root,
41
- feature_asset_dir,
31
+ asset_root,
42
32
  feature_value_path,
43
33
  is_feature_dir,
44
- is_feature_asset_dir,
45
34
  table_path,
46
35
  upload_directory,
36
+ normalize_asset_dir,
37
+ asset_file_path,
38
+ asset_type_path,
47
39
  )
48
40
 
49
41
  try:
@@ -60,6 +52,51 @@ except ImportError:
60
52
  return []
61
53
 
62
54
 
55
+ class AssetFilePath(type(Path())):
56
+ """Derived class of Path that also includes information about a downloaded.
57
+
58
+ An AssetFilePath has all the methods associated with a pathlib.Path object. In addition, it defines additional
59
+ attributes associated with a DerviaML asset.
60
+
61
+ Attributes:
62
+ asset_types: A list of the types associated with this asset. From the Asset_Type controlled vocabulary.
63
+ asset_metadata: A dictionary of names and values of any additional columns associated with this asset.
64
+ asset_name: The name of the asset table
65
+ file_name: The name of the file in the local file system that has the asset contents
66
+ asset_rid: The RID of the asset if it has been uploaded into an asset table
67
+ """
68
+
69
+ def __new__(
70
+ cls,
71
+ asset_path,
72
+ asset_name: str,
73
+ file_name: str,
74
+ asset_metadata: dict[str, Any],
75
+ asset_types: list[str] | str,
76
+ asset_rid: Optional[RID] = None,
77
+ ):
78
+ """
79
+ Create a new Path object that has additional information related to the use of this path as an asset.
80
+
81
+ Args:
82
+ asset_path: Local path to the location of the asset.
83
+ asset_name: The name of the asset in the catalog (e.g. the asset table name).
84
+ file_name: Name of the local file that contains the contents of the asset.
85
+ asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
86
+ asset_types: A list of terms from the Asset_Type controlled vocabulary.
87
+ asset_rid: The RID of the asset if it has been uploaded into an asset table
88
+ """
89
+ obj = super().__new__(cls, asset_path)
90
+ obj.asset_types = (
91
+ asset_types if isinstance(asset_types, list) else [asset_types]
92
+ )
93
+ obj.asset_metadata = asset_metadata
94
+ obj.asset_name = asset_name
95
+ obj.file_name = file_name
96
+ obj.asset_rid = asset_rid
97
+ return obj
98
+
99
+
63
100
  class Execution:
64
101
  """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
65
102
  computational, manual processes can be represented by an execution as well.
@@ -102,18 +139,19 @@ class Execution:
102
139
  """
103
140
 
104
141
  Args:
105
- configuration:
106
- ml_object:
142
+ configuration: Execution configuration object that describes the execution.
143
+ ml_object: The DerivaML instance that created the execution.
107
144
  reload: RID of previously initialized execution object.
108
145
  """
109
- self.asset_paths: list[Path] = []
146
+ self.asset_paths: list[AssetFilePath] = []
110
147
  self.configuration = configuration
111
148
  self._ml_object = ml_object
149
+ self._model = ml_object.model
112
150
  self._logger = ml_object._logger
113
151
  self.start_time = None
114
152
  self.stop_time = None
115
153
  self.status = Status.created
116
- self.uploaded_assets: list[Path] = []
154
+ self.uploaded_assets: Optional[dict[str, list[AssetFilePath]]] = None
117
155
  self.configuration.argv = sys.argv
118
156
 
119
157
  self.dataset_rids: list[RID] = []
@@ -124,6 +162,7 @@ class Execution:
124
162
  self._cache_dir = self._ml_object.cache_dir
125
163
  self._dry_run = dry_run
126
164
 
165
+ # Make sure we have a good workflow.
127
166
  if isinstance(self.configuration.workflow, Workflow):
128
167
  self.workflow_rid = (
129
168
  self._ml_object.add_workflow(self.configuration.workflow)
@@ -140,6 +179,7 @@ class Execution:
140
179
  "Workflow specified in execution configuration is not a Workflow"
141
180
  )
142
181
 
182
+ # Validate the datasets and assets to be valid.
143
183
  for d in self.configuration.datasets:
144
184
  if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
145
185
  raise DerivaMLException(
@@ -147,9 +187,7 @@ class Execution:
147
187
  )
148
188
 
149
189
  for a in self.configuration.assets:
150
- if not self._ml_object.model.is_asset(
151
- self._ml_object.resolve_rid(a).table.name
152
- ):
190
+ if not self._model.is_asset(self._ml_object.resolve_rid(a).table.name):
153
191
  raise DerivaMLException(
154
192
  "Asset specified in execution configuration is not a asset table"
155
193
  )
@@ -176,15 +214,12 @@ class Execution:
176
214
  self._initialize_execution(reload)
177
215
 
178
216
  def _save_runtime_environment(self):
179
- runtime_env_path = ExecMetadataVocab.runtime_env.value
180
- runtime_env_dir = self.execution_metadata_path(runtime_env_path)
181
- with NamedTemporaryFile(
182
- "w+",
183
- dir=runtime_env_dir,
184
- prefix="environment_snapshot_",
185
- suffix=".txt",
186
- delete=False,
187
- ) as fp:
217
+ runtime_env_path = self.asset_file_path(
218
+ asset_name="Execution_Metadata",
219
+ file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
220
+ asset_types=ExecMetadataVocab.runtime_env.value,
221
+ )
222
+ with open(runtime_env_path, "w") as fp:
188
223
  json.dump(get_execution_environment(), fp)
189
224
 
190
225
  def _initialize_execution(self, reload: Optional[RID] = None) -> None:
@@ -206,6 +241,7 @@ class Execution:
206
241
  )
207
242
  self.datasets.append(self.download_dataset_bag(dataset))
208
243
  self.dataset_rids.append(dataset.rid)
244
+
209
245
  # Update execution info
210
246
  schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
211
247
  if self.dataset_rids and not (reload or self._dry_run):
@@ -218,17 +254,30 @@ class Execution:
218
254
 
219
255
  # Download assets....
220
256
  self.update_status(Status.running, "Downloading assets ...")
221
- self.asset_paths = [
222
- self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
223
- for a in self.configuration.assets
224
- ]
225
- if self.asset_paths and not (reload or self._dry_run):
226
- self._update_execution_asset_table(self.configuration.assets)
257
+ self.asset_paths = {}
258
+ for asset_rid in self.configuration.assets:
259
+ asset_table = self._ml_object.resolve_rid(asset_rid).table.name
260
+ dest_dir = (
261
+ execution_root(self._ml_object.working_dir, self.execution_rid)
262
+ / "downloaded-assets"
263
+ / asset_table
264
+ )
265
+ dest_dir.mkdir(parents=True, exist_ok=True)
266
+ self.asset_paths.setdefault(asset_table, []).append(
267
+ self.download_asset(
268
+ asset_rid=asset_rid,
269
+ dest_dir=dest_dir,
270
+ update_catalog=not (reload or self._dry_run),
271
+ )
272
+ )
227
273
 
228
274
  # Save configuration details for later upload
229
- exec_config_path = ExecMetadataVocab.execution_config.value
230
- cfile = self.execution_metadata_path(exec_config_path) / "configuration.json"
231
- with open(cfile, "w", encoding="utf-8") as config_file:
275
+ cfile = self.asset_file_path(
276
+ asset_name="Execution_Metadata",
277
+ file_name="configuration.json",
278
+ asset_types=ExecMetadataVocab.execution_config.value,
279
+ )
280
+ with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
232
281
  json.dump(self.configuration.model_dump(), config_file)
233
282
 
234
283
  # save runtime env
@@ -237,6 +286,42 @@ class Execution:
237
286
  self.start_time = datetime.now()
238
287
  self.update_status(Status.pending, "Initialize status finished.")
239
288
 
289
+ @property
290
+ def _execution_root(self) -> Path:
291
+ """
292
+
293
+ Args:
294
+
295
+ Returns:
296
+ :return:
297
+
298
+ """
299
+ return execution_root(self._working_dir, self.execution_rid)
300
+
301
+ @property
302
+ def _feature_root(self) -> Path:
303
+ """The root path to all execution specific files.
304
+ :return:
305
+
306
+ Args:
307
+
308
+ Returns:
309
+
310
+ """
311
+ return feature_root(self._working_dir, self.execution_rid)
312
+
313
+ @property
314
+ def _asset_root(self) -> Path:
315
+ """The root path to all execution specific files.
316
+ :return:
317
+
318
+ Args:
319
+
320
+ Returns:
321
+
322
+ """
323
+ return asset_root(self._working_dir, self.execution_rid)
324
+
240
325
  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
241
326
  def download_dataset_bag(self, dataset: DatasetSpec) -> DatasetBag:
242
327
  """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
@@ -276,27 +361,6 @@ class Execution:
276
361
  ]
277
362
  )
278
363
 
279
- def _create_notebook_checkpoint(self):
280
- """Trigger a checkpoint creation using Jupyter's API."""
281
-
282
- server, session = self._ml_object._get_notebook_session()
283
- notebook_name = session["notebook"]["path"]
284
- notebook_url = f"{server['url']}api/contents/{notebook_name}"
285
-
286
- # Get notebook content
287
- response = requests.get(
288
- notebook_url, headers={"Authorization": f"Token {server['token']}"}
289
- )
290
- if response.status_code == 200:
291
- notebook_content = response.json()["content"]
292
- # Execution metadata cannot be in a directory, so map path into filename.
293
- checkpoint_path = (
294
- self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
295
- / f"{notebook_name.replace('/', '_')}.checkpoint"
296
- )
297
- with open(checkpoint_path, "w", encoding="utf-8") as f:
298
- json.dump(notebook_content, f)
299
-
300
364
  def execution_start(self) -> None:
301
365
  """Start an execution, uploading status to catalog"""
302
366
 
@@ -318,7 +382,7 @@ class Execution:
318
382
  self._ml_object.ml_schema
319
383
  ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
320
384
 
321
- def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
385
+ def _upload_execution_dirs(self) -> dict[str, list[AssetFilePath]]:
322
386
  """Upload execution assets at _working_dir/Execution_asset.
323
387
 
324
388
  This routine uploads the contents of the
@@ -332,86 +396,142 @@ class Execution:
332
396
  DerivaMLException: If there is an issue uploading the assets.
333
397
  """
334
398
 
335
- def asset_name(p: str) -> str:
336
- return Path(*Path(p).parts[-2:]).as_posix()
337
-
338
399
  try:
339
400
  self.update_status(Status.running, "Uploading execution files...")
340
- results = upload_directory(self._ml_object.model, self._execution_root)
341
- results = {asset_name(k): v for k, v in results.items()}
342
-
343
- execution_assets = [
344
- r.result["RID"]
345
- for r in results.values()
346
- if r.state == UploadState.success and "Execution_Asset_Type" in r.result
347
- ]
348
- execution_metadata = [
349
- r.result["RID"]
350
- for r in results.values()
351
- if r.state == UploadState.success
352
- and "Execution_Metadata_Type" in r.result
353
- ]
354
- self._update_execution_asset_table(execution_assets)
355
- self._update_execution_metadata_table(execution_metadata)
356
-
357
- except Exception as e:
401
+ results = upload_directory(self._model, self._asset_root)
402
+ except RuntimeError as e:
358
403
  error = format_exception(e)
359
404
  self.update_status(Status.failed, error)
360
405
  raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
361
406
 
407
+ asset_map = {}
408
+ for path, status in results.items():
409
+ asset_table, file_name = normalize_asset_dir(path)
410
+
411
+ asset_map.setdefault(asset_table, []).append(
412
+ AssetFilePath(
413
+ asset_path=path,
414
+ asset_name=asset_table,
415
+ file_name=file_name,
416
+ asset_metadata={
417
+ k: v
418
+ for k, v in status.result.items()
419
+ if k in self._model.asset_metadata(asset_table.split("/")[1])
420
+ },
421
+ asset_types=[],
422
+ asset_rid=status.result["RID"],
423
+ )
424
+ )
425
+
426
+ self._update_asset_execution_table(asset_map)
362
427
  self.update_status(Status.running, "Updating features...")
363
428
 
364
- feature_assets = defaultdict(dict)
365
-
366
- def traverse_bottom_up(directory: Path):
367
- """Traverses the directory tree in a bottom-up order.
368
-
369
- Args:
370
- directory: Path:
371
-
372
- Returns:
373
-
374
- """
375
- entries = list(directory.iterdir())
376
- for entry in entries:
377
- if entry.is_dir():
378
- yield from traverse_bottom_up(entry)
379
- yield directory
380
-
381
- for p in traverse_bottom_up(self._feature_root):
382
- if m := is_feature_asset_dir(p):
383
- try:
384
- self.update_status(
385
- Status.running, f"Uploading feature {m['feature_name']}..."
386
- )
387
- feature_assets[m["target_table"], m["feature_name"]] = (
388
- self._ml_object.upload_assets(p)
389
- )
390
- results |= feature_assets[m["target_table"], m["feature_name"]]
391
- except Exception as e:
392
- error = format_exception(e)
393
- self.update_status(Status.failed, error)
394
- raise DerivaMLException(
395
- f"Fail to upload execution metadata. Error: {error}"
396
- )
397
- elif m := is_feature_dir(p):
398
- files = [f for f in p.iterdir() if f.is_file()]
399
- if files:
400
- self._update_feature_table(
401
- target_table=m["target_table"],
402
- feature_name=m["feature_name"],
403
- feature_file=files[0],
404
- uploaded_files=feature_assets[
405
- m["target_table"], m["feature_name"]
406
- ],
407
- )
429
+ for p in self._feature_root.glob("**/*.jsonl"):
430
+ m = is_feature_dir(p.parent)
431
+ self._update_feature_table(
432
+ target_table=m["target_table"],
433
+ feature_name=m["feature_name"],
434
+ feature_file=p,
435
+ uploaded_files=asset_map,
436
+ )
408
437
 
409
438
  self.update_status(Status.running, "Upload assets complete")
410
- return results
439
+ return asset_map
440
+
441
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
442
+ def download_asset(
443
+ self, asset_rid: RID, dest_dir: Path, update_catalog=True
444
+ ) -> AssetFilePath:
445
+ """Download an asset from a URL and place it in a local directory.
446
+
447
+ Args:
448
+ asset_rid: URL of the asset.
449
+ dest_dir: Destination directory for the asset.
450
+ update_catalog: Whether to update the catalog execution information after downloading.
451
+
452
+ Returns:
453
+ A tuple with the name of the asset table and a Path object to the downloaded asset.
454
+ """
455
+
456
+ asset_table = self._ml_object.resolve_rid(asset_rid).table
457
+ if not self._model.is_asset(asset_table):
458
+ raise DerivaMLException(f"RID {asset_rid} is not for an asset table.")
459
+
460
+ asset_record = self._ml_object.retrieve_rid(asset_rid)
461
+ asset_metadata = {
462
+ k: v
463
+ for k, v in asset_record.items()
464
+ if k in self._model.asset_metadata(asset_table)
465
+ }
466
+ asset_url = asset_record["URL"]
467
+ asset_filename = dest_dir / asset_record["Filename"]
468
+ hs = HatracStore("https", self._ml_object.host_name, self._ml_object.credential)
469
+ hs.get_obj(path=asset_url, destfilename=asset_filename.as_posix())
470
+
471
+ asset_type_table = self._model.find_association(asset_table, MLVocab.asset_type)
472
+ type_path = self._ml_object.pathBuilder.schemas[
473
+ asset_type_table.schema.name
474
+ ].tables[asset_type_table.name]
475
+ asset_types = [
476
+ asset_type[MLVocab.asset_type.value]
477
+ for asset_type in type_path.filter(
478
+ type_path.columns[asset_table.name] == asset_rid
479
+ )
480
+ .attributes(type_path.Asset_Type)
481
+ .fetch()
482
+ ]
483
+
484
+ asset_path = AssetFilePath(
485
+ file_name=asset_filename,
486
+ asset_rid=asset_rid,
487
+ asset_path=asset_filename,
488
+ asset_metadata=asset_metadata,
489
+ asset_name=asset_table.name,
490
+ asset_types=asset_types,
491
+ )
492
+
493
+ if update_catalog:
494
+ self._update_asset_execution_table(
495
+ {f"{asset_table.schema.name}/{asset_table.name}": [asset_path]},
496
+ asset_role="Input",
497
+ )
498
+ return asset_path
499
+
500
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
501
+ def upload_assets(
502
+ self,
503
+ assets_dir: str | Path,
504
+ ) -> dict[Any, FileUploadState] | None:
505
+ """Upload assets from a directory.
506
+
507
+ This routine assumes that the current upload specification includes a configuration for the specified directory.
508
+ Every asset in the specified directory is uploaded
509
+
510
+ Args:
511
+ assets_dir: Directory containing the assets to upload.
512
+
513
+ Returns:
514
+ Results of the upload operation.
515
+
516
+ Raises:
517
+ DerivaMLException: If there is an issue uploading the assets.
518
+ """
519
+
520
+ def path_to_asset(path: str) -> str:
521
+ """Pull the asset name out of a path to that asset in the filesystem"""
522
+ components = path.split("/")
523
+ return components[
524
+ components.index("asset") + 2
525
+ ] # Look for asset in the path to find the name
526
+
527
+ if not self._model.is_asset(Path(assets_dir).name):
528
+ raise DerivaMLException("Directory does not have name of an asset table.")
529
+ results = upload_directory(self._model, assets_dir)
530
+ return {path_to_asset(p): r for p, r in results.items()}
411
531
 
412
532
  def upload_execution_outputs(
413
533
  self, clean_folder: bool = True
414
- ) -> dict[str, FileUploadState]:
534
+ ) -> dict[str, list[AssetFilePath]]:
415
535
  """Upload all the assets and metadata associated with the current execution.
416
536
 
417
537
  This will include any new assets, features, or table values.
@@ -427,29 +547,16 @@ class Execution:
427
547
  if self._dry_run:
428
548
  return {}
429
549
  try:
430
- uploaded_assets = self._upload_execution_dirs()
550
+ self.uploaded_assets = self._upload_execution_dirs()
431
551
  self.update_status(Status.completed, "Successfully end the execution.")
432
552
  if clean_folder:
433
553
  self._clean_folder_contents(self._execution_root)
434
- return uploaded_assets
554
+ return self.uploaded_assets
435
555
  except Exception as e:
436
556
  error = format_exception(e)
437
557
  self.update_status(Status.failed, error)
438
558
  raise e
439
559
 
440
- def _asset_dir(self) -> Path:
441
- """
442
-
443
- Args:
444
-
445
- Returns:
446
- :return: PathLib path object to model directory.
447
-
448
- """
449
- path = self._working_dir / self.execution_rid / "asset"
450
- path.mkdir(parents=True, exist_ok=True)
451
- return path
452
-
453
560
  def _clean_folder_contents(self, folder_path: Path):
454
561
  """
455
562
 
@@ -472,7 +579,7 @@ class Execution:
472
579
  target_table: str,
473
580
  feature_name: str,
474
581
  feature_file: str | Path,
475
- uploaded_files: dict[str, FileUploadState],
582
+ uploaded_files: dict[str, list[AssetFilePath]],
476
583
  ) -> None:
477
584
  """
478
585
 
@@ -480,121 +587,140 @@ class Execution:
480
587
  target_table: str:
481
588
  feature_name: str:
482
589
  feature_file: str | Path:
483
- uploaded_files: dict[str: FileUploadState]:
590
+ uploaded_files: Dictionary whose key ia an asset name, file-name pair, and whose value is a filename, RID of that asset.
484
591
  """
485
592
 
593
+ # Get the column names of all the Feature columns that should be the RID of an asset
486
594
  asset_columns = [
487
595
  c.name
488
596
  for c in self._ml_object.feature_record_class(
489
597
  target_table, feature_name
490
598
  ).feature.asset_columns
491
599
  ]
600
+
601
+ # Get the names of the columns in the feature that are assets.
602
+ asset_columns = [
603
+ c.name
604
+ for c in self._ml_object.feature_record_class(
605
+ target_table, feature_name
606
+ ).feature.asset_columns
607
+ ]
608
+
492
609
  feature_table = self._ml_object.feature_record_class(
493
610
  target_table, feature_name
494
611
  ).feature.feature_table.name
612
+ asset_map = {
613
+ (asset_table, asset.file_name): asset.asset_rid
614
+ for asset_table, assets in uploaded_files.items()
615
+ for asset in assets
616
+ }
495
617
 
496
618
  def map_path(e):
497
- """
498
-
499
- Args:
500
- e:
501
-
502
- Returns:
503
-
504
- """
505
- # Go through the asset columns and replace the file name with the RID for the uploaded file.
619
+ """Go through the asset columns and replace the file name with the RID for the uploaded file."""
506
620
  for c in asset_columns:
507
- e[c] = asset_map[e[c]]
621
+ e[c] = asset_map[normalize_asset_dir(e[c])]
508
622
  return e
509
623
 
510
- # Create a map between a file name that appeared in the file to the RID of the uploaded file.
511
- asset_map = {
512
- file: asset.result["RID"]
513
- for file, asset in uploaded_files.items()
514
- if asset.state == UploadState.success and asset.result
515
- }
624
+ # Load the JSON file that has the set of records that contain the feature values.
516
625
  with open(feature_file, "r") as feature_values:
517
- entities = [map_path(e) for e in csv.DictReader(feature_values)]
518
- self._ml_object.domain_path.tables[feature_table].insert(entities)
519
-
520
- def _update_execution_metadata_table(self, assets: list[RID]) -> None:
521
- """Upload execution metadata at _working_dir/Execution_metadata."""
522
- ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
523
- entities = [
524
- {"Execution_Metadata": metadata_rid, "Execution": self.execution_rid}
525
- for metadata_rid in assets
526
- ]
527
- ml_schema_path.Execution_Metadata_Execution.insert(entities)
528
-
529
- def _update_execution_asset_table(self, assets: list[RID]) -> None:
530
- """Assets associated with an execution must be linked to an execution entity after they are uploaded into
531
- the catalog. This routine takes a list of uploaded assets and makes that association.
532
-
533
- Args:
534
- assets: list of RIDS for execution assets.:
535
- """
536
- ml_schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
537
- entities = [
538
- {"Execution_Asset": asset_rid, "Execution": self.execution_rid}
539
- for asset_rid in assets
540
- ]
541
- ml_schema_path.Execution_Asset_Execution.insert(entities)
542
-
543
- @property
544
- def _execution_metadata_dir(self) -> Path:
545
- """
546
-
547
- Args:
548
-
549
- Returns:
550
- to the catalog by the execution_upload method in an execution object.
551
-
552
- :return:
553
-
554
- """
555
- return execution_metadata_dir(
556
- self._working_dir, exec_rid=self.execution_rid, metadata_type=""
626
+ entities = [json.loads(line.strip()) for line in feature_values]
627
+ # Update the asset columns in the feature and add to the catalog.
628
+ self._ml_object.domain_path.tables[feature_table].insert(
629
+ [map_path(e) for e in entities]
557
630
  )
558
631
 
559
- def execution_metadata_path(self, metadata_type: str) -> Path:
560
- """Return a pathlib Path to the directory in which to place files of type metadata_type.
561
-
562
- These files are uploaded to the catalog as part of the execution of the upload_execution method in DerivaML.
632
+ def _update_asset_execution_table(
633
+ self,
634
+ uploaded_assets: dict[str, list[AssetFilePath]],
635
+ asset_role: str = "Output",
636
+ ):
637
+ """Add entry to association table connecting an asset to an execution RID
563
638
 
564
639
  Args:
565
- metadata_type: Type of metadata to be uploaded. Must be a term in Metadata_Type controlled vocabulary.
566
-
567
- Returns:
568
- Path to the directory in which to place files of type metadata_type.
569
- """
570
- self._ml_object.lookup_term(
571
- MLVocab.execution_metadata_type, metadata_type
572
- ) # Make sure metadata type exists.
573
- return execution_metadata_dir(
574
- self._working_dir, exec_rid=self.execution_rid, metadata_type=metadata_type
575
- )
576
-
577
- @property
578
- def _execution_asset_dir(self) -> Path:
579
- """
640
+ uploaded_assets: Dictionary whose key is the name of an asset table, and whose value is a list of RIDs for
641
+ newly added assets to that table.
642
+ asset_role: A term or list of terms from the Asset_Role vocabulary.
643
+ """
644
+ # Make sure the asset role is in the controlled vocabulary table.
645
+ self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
646
+
647
+ pb = self._ml_object.pathBuilder
648
+ for asset_table, asset_list in uploaded_assets.items():
649
+ asset_table_name = asset_table.split("/")[
650
+ 1
651
+ ] # Peel off the schema from the asset table
652
+ asset_exe = self._model.find_association(asset_table_name, "Execution")
653
+ asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]
654
+ asset_exe_path.insert(
655
+ [
656
+ {
657
+ asset_table_name: asset_path.asset_rid,
658
+ "Execution": self.execution_rid,
659
+ "Asset_Role": asset_role,
660
+ }
661
+ for asset_path in asset_list
662
+ ]
663
+ )
580
664
 
581
- Args:
665
+ # Now add in the type names via the asset_asset_type association table.
666
+ # Get the list of types for each file in the asset.
667
+ if asset_role == "Input":
668
+ return
669
+ asset_type_map = {}
670
+ with open(
671
+ asset_type_path(
672
+ self._working_dir,
673
+ self.execution_rid,
674
+ self._model.name_to_table(asset_table_name),
675
+ ),
676
+ "r",
677
+ ) as f:
678
+ for line in f:
679
+ asset_type_map.update(json.loads(line.strip()))
680
+ for asset_path in asset_list:
681
+ asset_path.asset_types = asset_type_map[asset_path.file_name]
682
+
683
+ asset_asset_type = self._model.find_association(
684
+ asset_table_name, "Asset_Type"
685
+ )
686
+ type_path = pb.schemas[asset_asset_type.schema.name].tables[
687
+ asset_asset_type.name
688
+ ]
689
+ type_path.insert(
690
+ [
691
+ {asset_table_name: asset.asset_rid, "Asset_Type": t}
692
+ for asset in asset_list
693
+ for t in asset_type_map[asset.file_name]
694
+ ]
695
+ )
582
696
 
583
- Returns:
584
- :return:
697
+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
698
+ def asset_file_path(
699
+ self,
700
+ asset_name: str,
701
+ file_name: str,
702
+ asset_types: Optional[list[str] | str] = None,
703
+ copy_file=False,
704
+ **kwargs,
705
+ ) -> AssetFilePath:
706
+ """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
585
707
 
586
- """
587
- return execution_asset_dir(
588
- self._working_dir, exec_rid=self.execution_rid, asset_type=""
589
- )
708
+ Given the name of an asset table, and a file name, register the file for upload, and return a path to that
709
+ file in the upload directory. In addition to the filename, additioal asset metadata and file asset types may
710
+ be specified.
590
711
 
591
- def execution_asset_path(self, asset_type: str) -> Path:
592
- """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
712
+ This routine has three modes, depending on if file_name refers to an existing file. If it doesn't, a path
713
+ to a new file with the specified name is returned. The caller can then open that file for writing.
593
714
 
594
- These files are uploaded as part of the upload_execution method in DerivaML class.
715
+ If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
716
+ returned path contains a symbolic link to that file. If the copy_file argument is True then the contents of
717
+ file_name are copied into the target directory.
595
718
 
596
719
  Args:
597
- asset_type: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
720
+ asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
721
+ file_name: Name of file to be uploaded.
722
+ asset_types: Type of asset to be uploaded. Defaults to name of the asset.
723
+ **kwargs: Any additional metadata values that may be part of the asset table.
598
724
 
599
725
  Returns:
600
726
  Path in which to place asset files.
@@ -602,73 +728,46 @@ class Execution:
602
728
  Raises:
603
729
  DerivaException: If the asset type is not defined.
604
730
  """
605
- self._ml_object.lookup_term(MLVocab.execution_asset_type, asset_type)
606
-
607
- return execution_asset_dir(
608
- self._working_dir, exec_rid=self.execution_rid, asset_type=asset_type
609
- )
610
-
611
- @property
612
- def _execution_root(self) -> Path:
613
- """
614
-
615
- Args:
616
-
617
- Returns:
618
- :return:
619
-
620
- """
621
- return execution_root(self._working_dir, self.execution_rid)
622
-
623
- @property
624
- def _feature_root(self) -> Path:
625
- """The root path to all execution specific files.
626
- :return:
627
-
628
- Args:
629
-
630
- Returns:
631
-
632
- """
633
- return feature_root(self._working_dir, self.execution_rid)
634
-
635
- def feature_paths(
636
- self, table: Table | str, feature_name: str
637
- ) -> tuple[Path, dict[str, Path]]:
638
- """Return the file path of where to place feature values, and assets for the named feature and table.
731
+ if not self._model.is_asset(asset_name):
732
+ DerivaMLException(f"Table {asset_name} is not an asset")
639
733
 
640
- A side effect of calling this routine is that the directories in which to place the feature values and assets
641
- will be created
734
+ asset_table = self._model.name_to_table(asset_name)
642
735
 
643
- Args:
644
- table: The table with which the feature is associated.
645
- feature_name: Name of the feature
646
-
647
- Returns:
648
- A tuple whose first element is the path for the feature values and whose second element is a dictionary
649
- of associated asset table names and corresponding paths.
650
- """
651
- feature = self._ml_object.lookup_feature(table, feature_name)
736
+ asset_types = asset_types or kwargs.get("Asset_Type", None) or asset_name
737
+ asset_types = [asset_types] if isinstance(asset_types, str) else asset_types
738
+ for t in asset_types:
739
+ self._ml_object.lookup_term(MLVocab.asset_type, t)
652
740
 
653
- tpath = feature_value_path(
741
+ file_name = Path(file_name)
742
+ asset_path = asset_file_path(
654
743
  self._working_dir,
655
- schema=self._ml_object.domain_schema,
656
- target_table=feature.target_table.name,
657
- feature_name=feature_name,
658
- exec_rid=self.execution_rid,
744
+ self.execution_rid,
745
+ self._model.name_to_table(asset_name),
746
+ file_name.name,
747
+ metadata=kwargs,
748
+ )
749
+
750
+ if file_name.exists():
751
+ if copy_file:
752
+ asset_path.write_bytes(file_name.read_bytes())
753
+ else:
754
+ asset_path.symlink_to(file_name)
755
+
756
+ # Persist the asset types into a file
757
+ with open(
758
+ asset_type_path(self._working_dir, self.execution_rid, asset_table),
759
+ "a",
760
+ encoding="utf-8",
761
+ ) as f:
762
+ f.write(json.dumps({file_name.name: asset_types}) + "\n")
763
+
764
+ return AssetFilePath(
765
+ asset_path=asset_path,
766
+ asset_name=asset_name,
767
+ file_name=file_name.name,
768
+ asset_metadata=kwargs,
769
+ asset_types=asset_types,
659
770
  )
660
- asset_paths = {
661
- asset_table.name: feature_asset_dir(
662
- self._working_dir,
663
- exec_rid=self.execution_rid,
664
- schema=self._ml_object.domain_schema,
665
- target_table=feature.target_table.name,
666
- feature_name=feature_name,
667
- asset_table=asset_table.name,
668
- )
669
- for asset_table in feature.asset_columns
670
- }
671
- return tpath, asset_paths
672
771
 
673
772
  def table_path(self, table: str) -> Path:
674
773
  """Return a local file path to a CSV to add values to a table on upload.
@@ -679,10 +778,7 @@ class Execution:
679
778
  Returns:
680
779
  Pathlib path to the file in which to place table values.
681
780
  """
682
- if (
683
- table
684
- not in self._ml_object.model.schemas[self._ml_object.domain_schema].tables
685
- ):
781
+ if table not in self._model.schemas[self._ml_object.domain_schema].tables:
686
782
  raise DerivaMLException(
687
783
  "Table '{}' not found in domain schema".format(table)
688
784
  )
@@ -693,10 +789,11 @@ class Execution:
693
789
 
694
790
  def execute(self) -> Execution:
695
791
  """Initiate an execution with provided configuration. Can be used in a context manager."""
792
+ self.execution_start()
696
793
  return self
697
794
 
698
795
  @validate_call
699
- def write_feature_file(self, features: Iterable[FeatureRecord]) -> None:
796
+ def add_features(self, features: Iterable[FeatureRecord]) -> None:
700
797
  """Given a collection of Feature records, write out a CSV file in the appropriate assets directory so that this
701
798
  feature gets uploaded when the execution is complete.
702
799
 
@@ -704,22 +801,28 @@ class Execution:
704
801
  features: Iterable of Feature records to write.
705
802
  """
706
803
 
707
- feature_iter = iter(features)
708
- first_row = next(feature_iter)
804
+ # Make sure feature list is homogeneous:
805
+ sorted_features = defaultdict(list)
806
+ for f in features:
807
+ sorted_features[type(f)].append(f)
808
+ for fs in sorted_features.values():
809
+ self._add_features(fs)
810
+
811
+ def _add_features(self, features: list[FeatureRecord]) -> None:
812
+ # Update feature records to include current execution_rid
813
+ first_row = features[0]
709
814
  feature = first_row.feature
710
- csv_path, _ = self.feature_paths(
711
- feature.target_table.name, feature.feature_name
815
+ json_path = feature_value_path(
816
+ self._working_dir,
817
+ schema=self._ml_object.domain_schema,
818
+ target_table=feature.target_table.name,
819
+ feature_name=feature.feature_name,
820
+ exec_rid=self.execution_rid,
712
821
  )
713
-
714
- fieldnames = {"Execution", "Feature_Name", feature.target_table.name}
715
- fieldnames |= {f.name for f in feature.feature_columns}
716
-
717
- with open(csv_path, "w") as f:
718
- writer = csv.DictWriter(f, fieldnames=fieldnames)
719
- writer.writeheader()
720
- writer.writerow(first_row.model_dump())
721
- for feature in feature_iter:
722
- writer.writerow(feature.model_dump())
822
+ with open(json_path, "a", encoding="utf-8") as file:
823
+ for feature in features:
824
+ feature.Execution = self.execution_rid
825
+ file.write(json.dumps(feature.model_dump(mode="json")) + "\n")
723
826
 
724
827
  @validate_call
725
828
  def create_dataset(self, dataset_types: str | list[str], description: str) -> RID:
@@ -748,9 +851,13 @@ class Execution:
748
851
  Add new elements to an existing dataset. In addition to adding new members, the minor version number of the
749
852
  dataset is incremented and the description, if provided, is applied to that new version.
750
853
 
854
+ The RIDs in the list to not have to be all from the same table, but they must be from a table that has
855
+ been configured to be a dataset element type.
856
+
751
857
  Args:
752
858
  dataset_rid: RID of dataset_table to extend or None if new dataset_table is to be created.
753
- members: List of RIDs of members to add to the dataset_table.
859
+ members: List of RIDs of members to add to the dataset_table. RID must be to a table type that is a
860
+ dataset element type (see DerivaML.add_dataset_element_type).
754
861
  validate: Check rid_list to make sure elements are not already in the dataset_table.
755
862
  description: Markdown description of the updated dataset.
756
863
  """