deriva-ml 1.14.47__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,7 +41,6 @@ from deriva_ml.core.base import DerivaML
  from deriva_ml.core.definitions import (
  DRY_RUN_RID,
  RID,
- ExecAssetType,
  ExecMetadataType,
  FileSpec,
  FileUploadState,
@@ -198,7 +197,6 @@ class Execution:
  workflow_rid (RID): RID of the associated workflow.
  status (Status): Current execution status.
  asset_paths (list[AssetFilePath]): Paths to execution assets.
- parameters (dict): Execution parameters.
  start_time (datetime | None): When execution started.
  stop_time (datetime | None): When execution completed.
 
@@ -206,7 +204,6 @@ class Execution:
  >>> config = ExecutionConfiguration(
  ... workflow="analysis",
  ... description="Process samples",
- ... parameters={"threshold": 0.5}
  ... )
  >>> with ml.create_execution(config) as execution:
  ... execution.download_dataset_bag(dataset_spec)
@@ -250,7 +247,6 @@ class Execution:
 
  self.dataset_rids: List[RID] = []
  self.datasets: list[DatasetBag] = []
- self.parameters = self.configuration.parameters
 
  self._working_dir = self._ml_object.working_dir
  self._cache_dir = self._ml_object.cache_dir
@@ -292,9 +288,18 @@ class Execution:
  ]
  )[0]["RID"]
 
- if isinstance(self.configuration.workflow, Workflow) and self.configuration.workflow.is_notebook:
- # Put execution_rid into the cell output so we can find it later.
- display(Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}"))
+ if rid_path := os.environ.get("DERIVA_ML_SAVE_EXECUTION_RID", None):
+ # Put execution_rid into the provided file path so we can find it later.
+ with Path(rid_path).open("w") as f:
+ json.dump(
+ {
+ "hostname": self._ml_object.host_name,
+ "catalog_id": self._ml_object.catalog_id,
+ "workflow_rid": self.workflow_rid,
+ "execution_rid": self.execution_rid,
+ },
+ f,
+ )
 
  # Create a directory for execution rid so we can recover the state in case of a crash.
  execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
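
The notebook Markdown display is replaced by an environment-variable hook: when DERIVA_ML_SAVE_EXECUTION_RID names a file, the execution identifiers are written there as JSON. A minimal sketch of how a driver script might consume that file (the temp-file location is illustrative; the JSON keys match the added code above):

    import json
    import os
    import tempfile

    # Point deriva-ml at a scratch file before the execution is created.
    rid_file = os.path.join(tempfile.gettempdir(), "execution_rid.json")
    os.environ["DERIVA_ML_SAVE_EXECUTION_RID"] = rid_file

    # ... run the code that calls DerivaML.create_execution(...) ...

    # Afterwards the file holds hostname, catalog_id, workflow_rid, and execution_rid.
    with open(rid_file) as f:
        info = json.load(f)
    print(info["execution_rid"])
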
@@ -302,13 +307,28 @@ class Execution:
 
  def _save_runtime_environment(self):
  runtime_env_path = self.asset_file_path(
- "Execution_Metadata",
- f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
- ExecMetadataType.runtime_env.value,
+ asset_name="Execution_Metadata",
+ file_name=f"environment_snapshot_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
+ asset_types=ExecMetadataType.runtime_env.value,
  )
  with Path(runtime_env_path).open("w") as fp:
  json.dump(get_execution_environment(), fp)
 
+ def _upload_hydra_config_assets(self):
+ """Upload hydra assets to the catalog."""
+ hydra_runtime_output_dir = self._ml_object.hydra_runtime_output_dir
+ if hydra_runtime_output_dir:
+ timestamp = hydra_runtime_output_dir.parts[-1]
+ for hydra_asset in hydra_runtime_output_dir.rglob("*"):
+ if hydra_asset.is_dir():
+ continue
+ asset = self.asset_file_path(
+ asset_name=MLAsset.execution_metadata,
+ file_name=hydra_runtime_output_dir / hydra_asset,
+ rename_file=f"hydra-{timestamp}-{hydra_asset.name}",
+ asset_types=ExecMetadataType.execution_config.value,
+ )
+
  def _initialize_execution(self, reload: RID | None = None) -> None:
  """Initialize the execution by a configuration in the Execution_Metadata table.
  Set up a working directory and download all the assets and data.
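
The new _upload_hydra_config_assets helper stages every file under the Hydra runtime output directory as an Execution_Metadata asset, prefixing each upload name with the run directory's final path component. A small sketch of that naming scheme, assuming a conventional Hydra run directory (the path below is a placeholder):

    from pathlib import Path

    # Hypothetical Hydra run directory; its last component is used as the timestamp.
    run_dir = Path("outputs/2024-05-01/12-30-00")
    timestamp = run_dir.parts[-1]  # "12-30-00"

    # Mirrors the rename_file expression in _upload_hydra_config_assets above.
    for f in run_dir.rglob("*"):
        if f.is_dir():
            continue
        print(f"hydra-{timestamp}-{f.name}")  # e.g. "hydra-12-30-00-config.yaml"
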
@@ -354,9 +374,9 @@ class Execution:
  # Save configuration details for later upload
  if not reload:
  cfile = self.asset_file_path(
- MLAsset.execution_metadata,
- "configuration.json",
- ExecMetadataType.execution_config.value,
+ asset_name=MLAsset.execution_metadata,
+ file_name="configuration.json",
+ asset_types=ExecMetadataType.execution_config.value,
  )
  with Path(cfile).open("w", encoding="utf-8") as config_file:
  json.dump(self.configuration.model_dump(), config_file)
@@ -364,24 +384,18 @@ class Execution:
  lock_file = Path(self.configuration.workflow.git_root) / "uv.lock"
  if lock_file.exists():
  _ = self.asset_file_path(
- MLAsset.execution_metadata,
- lock_file,
- ExecMetadataType.execution_config.value,
+ asset_name=MLAsset.execution_metadata,
+ file_name=lock_file,
+ asset_types=ExecMetadataType.execution_config.value,
  )
 
- for parameter_file in self.configuration.parameters:
- self.asset_file_path(
- MLAsset.execution_asset,
- parameter_file,
- ExecAssetType.input_file.value,
- )
+ self._upload_hydra_config_assets()
 
  # save runtime env
  self._save_runtime_environment()
 
  # Now upload the files so we have the info in case the execution fails.
  self.uploaded_assets = self._upload_execution_dirs()
-
  self.start_time = datetime.now()
  self.update_status(Status.pending, "Initialize status finished.")
 
@@ -569,7 +583,6 @@ class Execution:
  asset_rid=status.result["RID"],
  )
  )
-
  self._update_asset_execution_table(asset_map)
  self.update_status(Status.running, "Updating features...")
 
@@ -791,7 +804,7 @@ class Execution:
  self,
  uploaded_assets: dict[str, list[AssetFilePath]],
  asset_role: str = "Output",
- ):
+ ) -> None:
  """Add entry to the association table connecting an asset to an execution RID
 
  Args:
@@ -800,6 +813,9 @@ class Execution:
  asset_role: A term or list of terms from the Asset_Role vocabulary.
  """
  # Make sure the asset role is in the controlled vocabulary table.
+ if self._dry_run:
+ # Don't do any updates if we are doing a dry run.
+ return
  self._ml_object.lookup_term(MLVocab.asset_role, asset_role)
 
  pb = self._ml_object.pathBuilder
@@ -856,6 +872,7 @@ class Execution:
  file_name: str | Path,
  asset_types: list[str] | str | None = None,
  copy_file=False,
+ rename_file: str | None = None,
  **kwargs,
  ) -> AssetFilePath:
  """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
@@ -875,6 +892,8 @@ class Execution:
  asset_name: Type of asset to be uploaded. Must be a term in Asset_Type controlled vocabulary.
  file_name: Name of file to be uploaded.
  asset_types: Type of asset to be uploaded. Defaults to the name of the asset.
+ copy_file: Whether to copy the file rather than creating a symbolic link.
+ rename_file: If provided, the file will be renamed to this name if the file already exists.
  **kwargs: Any additional metadata values that may be part of the asset table.
 
  Returns:
@@ -893,12 +912,15 @@ class Execution:
  for t in asset_types:
  self._ml_object.lookup_term(MLVocab.asset_type, t)
 
+ # Determine if we will need to rename an existing file as the asset.
  file_name = Path(file_name)
+ target_name = Path(rename_file) if file_name.exists() and rename_file else file_name
+
  asset_path = asset_file_path(
  prefix=self._working_dir,
  exec_rid=self.execution_rid,
  asset_table=self._model.name_to_table(asset_name),
- file_name=file_name.name,
+ file_name=target_name.name,
  metadata=kwargs,
  )
 
@@ -914,12 +936,12 @@ class Execution:
 
  # Persist the asset types into a file
  with Path(asset_type_path(self._working_dir, self.execution_rid, asset_table)).open("a") as asset_type_file:
- asset_type_file.write(json.dumps({file_name.name: asset_types}) + "\n")
+ asset_type_file.write(json.dumps({target_name.name: asset_types}) + "\n")
 
  return AssetFilePath(
  asset_path=asset_path,
  asset_name=asset_name,
- file_name=file_name.name,
+ file_name=target_name.name,
  asset_metadata=kwargs,
  asset_types=asset_types,
  )
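
A hypothetical call showing the new rename_file keyword: when the local file exists and a rename is given, the asset is staged, logged, and returned under the renamed value rather than the original file name (the execution object and file paths below are placeholders; the asset name and type come from the surrounding code):

    from deriva_ml.core.definitions import ExecMetadataType

    path = execution.asset_file_path(
        asset_name="Execution_Metadata",
        file_name="results/metrics.json",   # existing local file (illustrative)
        rename_file="run-42-metrics.json",   # name used for the staged asset
        asset_types=ExecMetadataType.execution_config.value,
    )
    # path.file_name == "run-42-metrics.json"
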
@@ -22,11 +22,13 @@ Typical usage example:
 
  from __future__ import annotations
 
+ from dataclasses import dataclass
  import json
  import sys
  from pathlib import Path
  from typing import Any
 
+ from hydra_zen import builds
  from pydantic import BaseModel, ConfigDict, Field, field_validator
 
  from deriva_ml.core.definitions import RID
@@ -67,42 +69,18 @@ class ExecutionConfiguration(BaseModel):
  datasets: list[DatasetSpec] = []
  assets: list[RID] = []
  workflow: RID | Workflow
- parameters: dict[str, Any] | Path = {}
  description: str = ""
  argv: list[str] = Field(default_factory=lambda: sys.argv)
 
  model_config = ConfigDict(arbitrary_types_allowed=True)
 
- @field_validator("parameters", mode="before")
- @classmethod
- def validate_parameters(cls, value: Any) -> Any:
- """Validates and loads execution parameters.
-
- If value is a file path, loads and parses it as JSON. Otherwise, returns
- the value as is.
-
- Args:
- value: Parameter value to validate, either:
- - Dictionary of parameters
- - Path to JSON file
- - String path to JSON file
-
- Returns:
- dict[str, Any]: Validated parameter dictionary.
-
- Raises:
- ValueError: If JSON file is invalid or cannot be read.
- FileNotFoundError: If parameter file doesn't exist.
-
- Example:
- >>> config = ExecutionConfiguration(parameters="params.json")
- >>> print(config.parameters) # Contents of params.json as dict
- """
- if isinstance(value, str) or isinstance(value, Path):
- with Path(value).open("r") as f:
- return json.load(f)
- else:
- return value
+ # @field_validator("datasets", mode="before")
+ # @classmethod
+ # def validate_datasets(cls, value: Any) -> Any:
+ # if isinstance(value, DatasetList):
+ # config_list: DatasetList = value
+ # value = config_list.datasets
+ # return value
 
  @field_validator("workflow", mode="before")
  @classmethod
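
With the parameters field and its validator removed, an ExecutionConfiguration now carries only datasets, assets, workflow, description, and argv; run-specific parameters are handled outside the configuration, e.g. via the Hydra config files uploaded by _upload_hydra_config_assets. A sketch of a configuration under the new model (the workflow and description values are placeholders):

    config = ExecutionConfiguration(
        workflow="analysis",           # RID or Workflow object
        description="Process samples",
        datasets=[],                   # list[DatasetSpec]
        assets=[],                     # list[RID]
    )
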
@@ -161,3 +139,20 @@ class ExecutionConfiguration(BaseModel):
  # hs = HatracStore("https", self.host_name, self.credential)
  # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
  # return ExecutionConfiguration.load_configuration(Path(dest_file.name))
+
+
+ @dataclass
+ class AssetRID(str):
+ rid: str
+ description: str = ""
+
+ def __new__(cls, rid: str, description: str = ""):
+ obj = super().__new__(cls, rid)
+ obj.description = description
+ return obj
+
+ AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
+
+
+
+
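
AssetRIDConfig is a hydra-zen structured config generated from AssetRID with builds(). A rough sketch of how such a config is typically declared and instantiated (the RID and description values are placeholders, and the class is repeated here without the @dataclass decorator to keep the sketch minimal):

    from hydra_zen import builds, instantiate

    class AssetRID(str):
        # A string-valued RID that also carries a human-readable description.
        def __new__(cls, rid: str, description: str = ""):
            obj = super().__new__(cls, rid)
            obj.description = description
            return obj

    AssetRIDConfig = builds(AssetRID, populate_full_signature=True)

    # The config node can live in a Hydra config tree and be instantiated at run time.
    cfg = AssetRIDConfig(rid="1-ABCD", description="training image asset")
    asset_rid = instantiate(cfg)
    assert asset_rid == "1-ABCD" and asset_rid.description == "training image asset"
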
@@ -9,6 +9,7 @@ from typing import Any
  import requests
  from pydantic import BaseModel, PrivateAttr, model_validator
  from requests import RequestException
+ from setuptools_scm import get_version
 
  from deriva_ml.core.definitions import RID
  from deriva_ml.core.exceptions import DerivaMLException
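
setuptools_scm.get_version, imported here, is used in the Workflow validator below to derive a version string from the enclosing git checkout. A standalone sketch of the call (the root path is a placeholder; the resulting string depends on the repository's tags):

    from setuptools_scm import get_version

    version = get_version(
        root=".",                        # repository root (illustrative)
        search_parent_directories=True,  # walk upward until a repository is found
        fallback_version="0.0",          # used when no SCM metadata is available
    )
    print(version)  # e.g. "1.17.1.dev3+g1a2b3c4" on an untagged commit
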
@@ -129,6 +130,13 @@ class Workflow(BaseModel):
  self.url, self.checksum = Workflow.get_url_and_checksum(path)
  self.git_root = Workflow._get_git_root(path)
 
+ self.version = get_version(
+ root=str(self.git_root or Path.cwd()),
+ search_parent_directories=True,
+ # Optional but recommended: provide a safe fallback when tags are absent
+ fallback_version="0.0",
+ )
+
  self._logger = logging.getLogger("deriva_ml")
  return self
 
@@ -8,7 +8,8 @@ ML-specific functionality. It handles schema management, feature definitions, an
  from __future__ import annotations
 
  # Standard library imports
- from collections import Counter
+ from collections import Counter, defaultdict
+ from graphlib import CycleError, TopologicalSorter
  from typing import Any, Callable, Final, Iterable, NewType, TypeAlias
 
  from deriva.core.ermrest_catalog import ErmrestCatalog
@@ -21,6 +22,7 @@ from pydantic import ConfigDict, validate_call
 
  from deriva_ml.core.definitions import (
  ML_SCHEMA,
+ RID,
  DerivaAssetColumns,
  TableDefinition,
  )
@@ -28,6 +30,7 @@ from deriva_ml.core.exceptions import DerivaMLException, DerivaMLTableTypeError
 
  # Local imports
  from deriva_ml.feature import Feature
+ from deriva_ml.protocols.dataset import DatasetLike
 
  try:
  from icecream import ic
@@ -287,6 +290,118 @@ class DerivaModel:
  else:
  self.model.apply()
 
+ def list_dataset_element_types(self) -> list[Table]:
+ """
+ Lists the element types that a dataset can contain.
+
+ This method analyzes the model and identifies the tables whose entries can
+ be members of a dataset. It is useful for understanding the structure and
+ content of a dataset and allows for better manipulation and usage of its
+ data.
+
+ Returns:
+ list[Table]: A list of tables, each of which can appear as an element
+ of a dataset.
+
+ """
+
+ dataset_table = self.name_to_table("Dataset")
+
+ def domain_table(table: Table) -> bool:
+ return table.schema.name == self.domain_schema or table.name == dataset_table.name
+
+ return [t for a in dataset_table.find_associations() if domain_table(t := a.other_fkeys.pop().pk_table)]
+
+ def _prepare_wide_table(self,
+ dataset,
+ dataset_rid: RID,
+ include_tables: list[str]) -> tuple[dict[str, Any], list[tuple]]:
+ """
+ Generates the join structure and column list for a wide (denormalized) table from the model.
+
+ Args:
+ include_tables (list[str] | None): List of table names to include in the denormalized dataset. If None,
+ all tables from the dataset will be included.
+
+ Returns:
+ tuple[dict[str, Any], list[tuple]]: A mapping from each dataset element table to its join order and
+ join conditions, plus the list of (table, column) pairs that appear in the denormalized table.
+ """
+
+ # Skip over tables that we don't want to include in the denormalized dataset.
+ # Also, strip off the Dataset/Dataset_X part of the path so we don't include dataset columns in the denormalized
+ # table.
+ include_tables = set(include_tables)
+ for t in include_tables:
+ # Check to make sure the table is in the catalog.
+ _ = self.name_to_table(t)
+
+ table_paths = [
+ path
+ for path in self._schema_to_paths()
+ if path[-1].name in include_tables and include_tables.intersection({p.name for p in path})
+ ]
+ paths_by_element = defaultdict(list)
+ for p in table_paths:
+ paths_by_element[p[2].name].append(p)
+
+ # Get the names of all of the tables that can be dataset elements.
+ dataset_element_tables = {
+ e.name for e in self.list_dataset_element_types() if e.schema.name == self.domain_schema
+ }
+
+ skip_columns = {"RCT", "RMT", "RCB", "RMB"}
+ element_tables = {}
+ for element_table, paths in paths_by_element.items():
+ graph = {}
+ for path in paths:
+ for left, right in zip(path[0:], path[1:]):
+ graph.setdefault(left.name, set()).add(right.name)
+
+ # Now let's remove any cycles that we may have in the graph.
+ # We will use a topological sort to find the order in which we need to join the tables.
+ # If we find a cycle, we will remove the table from the graph and splice in an additional ON clause.
+ # We will then repeat the process until there are no cycles.
+ graph_has_cycles = True
+ element_join_tables = []
+ element_join_conditions = {}
+ while graph_has_cycles:
+ try:
+ ts = TopologicalSorter(graph)
+ element_join_tables = list(reversed(list(ts.static_order())))
+ graph_has_cycles = False
+ except CycleError as e:
+ cycle_nodes = e.args[1]
+ if len(cycle_nodes) > 3:
+ raise DerivaMLException(f"Unexpected cycle found when normalizing dataset {cycle_nodes}")
+ # Remove cycle from graph and splice in additional ON constraint.
+ graph[cycle_nodes[1]].remove(cycle_nodes[0])
+
+ # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+ if "Dataset_Version" in element_join_tables:
+ element_join_tables.remove("Dataset_Version")
+
+ for path in paths:
+ for left, right in zip(path[0:], path[1:]):
+ if right.name == "Dataset_Version":
+ # The Dataset_Version table is a special case as it points to dataset and dataset to version.
+ continue
+ if element_join_tables.index(right.name) < element_join_tables.index(left.name):
+ continue
+ table_relationship = self._table_relationship(left, right)
+ element_join_conditions.setdefault(right.name, set()).add(
+ (table_relationship[0], table_relationship[1])
+ )
+ element_tables[element_table] = (element_join_tables, element_join_conditions)
+ # Get the list of columns that will appear in the final denormalized dataset.
+ denormalized_columns = [
+ (table_name, c.name)
+ for table_name in include_tables
+ if not self.is_association(table_name) # Don't include association columns in the denormalized view.
+ for c in self.name_to_table(table_name).columns
+ if (not include_tables or table_name in include_tables) and (c.name not in skip_columns)
+ ]
+ return element_tables, denormalized_columns
+
  def _table_relationship(
  self,
  table1: TableInput,
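
_prepare_wide_table orders its joins with graphlib's TopologicalSorter and, when the join graph contains a simple two-table cycle, drops one edge and retries, as in the while loop above. A toy illustration of that retry pattern (the table names are made up):

    from graphlib import CycleError, TopologicalSorter

    # Each key maps to the set of tables it depends on; Subject and Image form a 2-cycle.
    graph = {
        "Dataset": set(),
        "Subject": {"Dataset", "Image"},
        "Image": {"Subject"},
    }

    order = None
    while order is None:
        try:
            order = list(TopologicalSorter(graph).static_order())
        except CycleError as e:
            cycle = e.args[1]  # e.g. ['Subject', 'Image', 'Subject']
            if len(cycle) > 3:
                raise          # only simple two-table cycles are broken automatically
            graph[cycle[1]].remove(cycle[0])  # drop one edge of the cycle and retry

    print(order)  # e.g. ['Dataset', 'Image', 'Subject']
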
@@ -302,7 +417,9 @@ class DerivaModel:
  [(fk.referenced_columns[0], fk.foreign_key_columns[0]) for fk in table1.referenced_by if fk.table == table2]
  )
  if len(relationships) != 1:
- raise DerivaMLException(f"Ambiguous linkage between {table1.name} and {table2.name}")
+ raise DerivaMLException(
+ f"Ambiguous linkage between {table1.name} and {table2.name}: {[(r[0].name, r[1].name) for r in relationships]}"
+ )
  return relationships[0]
 
  def _schema_to_paths(