PyPI - deriva-ml - Versions diffs - 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl - Mend

deriva-ml 1.17.10-py3-none-any.whl → 1.17.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

deriva_ml/__init__.py +43 -1
deriva_ml/asset/__init__.py +17 -0
deriva_ml/asset/asset.py +357 -0
deriva_ml/asset/aux_classes.py +100 -0
deriva_ml/bump_version.py +254 -11
deriva_ml/catalog/__init__.py +21 -0
deriva_ml/catalog/clone.py +1199 -0
deriva_ml/catalog/localize.py +426 -0
deriva_ml/core/__init__.py +29 -0
deriva_ml/core/base.py +817 -1067
deriva_ml/core/config.py +169 -21
deriva_ml/core/constants.py +120 -19
deriva_ml/core/definitions.py +123 -13
deriva_ml/core/enums.py +47 -73
deriva_ml/core/ermrest.py +226 -193
deriva_ml/core/exceptions.py +297 -14
deriva_ml/core/filespec.py +99 -28
deriva_ml/core/logging_config.py +225 -0
deriva_ml/core/mixins/__init__.py +42 -0
deriva_ml/core/mixins/annotation.py +915 -0
deriva_ml/core/mixins/asset.py +384 -0
deriva_ml/core/mixins/dataset.py +237 -0
deriva_ml/core/mixins/execution.py +408 -0
deriva_ml/core/mixins/feature.py +365 -0
deriva_ml/core/mixins/file.py +263 -0
deriva_ml/core/mixins/path_builder.py +145 -0
deriva_ml/core/mixins/rid_resolution.py +204 -0
deriva_ml/core/mixins/vocabulary.py +400 -0
deriva_ml/core/mixins/workflow.py +322 -0
deriva_ml/core/validation.py +389 -0
deriva_ml/dataset/__init__.py +2 -1
deriva_ml/dataset/aux_classes.py +20 -4
deriva_ml/dataset/catalog_graph.py +575 -0
deriva_ml/dataset/dataset.py +1242 -1008
deriva_ml/dataset/dataset_bag.py +1311 -182
deriva_ml/dataset/history.py +27 -14
deriva_ml/dataset/upload.py +225 -38
deriva_ml/demo_catalog.py +126 -110
deriva_ml/execution/__init__.py +46 -2
deriva_ml/execution/base_config.py +639 -0
deriva_ml/execution/execution.py +543 -242
deriva_ml/execution/execution_configuration.py +26 -11
deriva_ml/execution/execution_record.py +592 -0
deriva_ml/execution/find_caller.py +298 -0
deriva_ml/execution/model_protocol.py +175 -0
deriva_ml/execution/multirun_config.py +153 -0
deriva_ml/execution/runner.py +595 -0
deriva_ml/execution/workflow.py +223 -34
deriva_ml/experiment/__init__.py +8 -0
deriva_ml/experiment/experiment.py +411 -0
deriva_ml/feature.py +6 -1
deriva_ml/install_kernel.py +143 -6
deriva_ml/interfaces.py +862 -0
deriva_ml/model/__init__.py +99 -0
deriva_ml/model/annotations.py +1278 -0
deriva_ml/model/catalog.py +286 -60
deriva_ml/model/database.py +144 -649
deriva_ml/model/deriva_ml_database.py +308 -0
deriva_ml/model/handles.py +14 -0
deriva_ml/run_model.py +319 -0
deriva_ml/run_notebook.py +507 -38
deriva_ml/schema/__init__.py +18 -2
deriva_ml/schema/annotations.py +62 -33
deriva_ml/schema/create_schema.py +169 -69
deriva_ml/schema/validation.py +601 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
deriva_ml-1.17.11.dist-info/RECORD +77 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
deriva_ml/protocols/dataset.py +0 -19
deriva_ml/test.py +0 -94
deriva_ml-1.17.10.dist-info/RECORD +0 -45
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0

deriva_ml/execution/workflow.py CHANGED Viewed

@@ -1,18 +1,23 @@
-import inspect
+from __future__ import annotations
 import logging
 import os
 import subprocess
 import sys
 import warnings
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 import requests
-from pydantic import BaseModel, PrivateAttr, model_validator
+from pydantic import BaseModel, ConfigDict, PrivateAttr, model_validator
 from requests import RequestException
 from deriva_ml.core.definitions import RID
 from deriva_ml.core.exceptions import DerivaMLException
+from deriva_ml.execution.find_caller import _get_calling_module
+if TYPE_CHECKING:
+    from deriva_ml.interfaces import DerivaMLCatalog
 try:
     from IPython.core.getipython import get_ipython
@@ -57,26 +62,60 @@ class Workflow(BaseModel):
     a unique identifier, source code location, and type. Workflows are typically
     associated with Git repositories for version control.
+    When a Workflow is retrieved via ``lookup_workflow(rid)`` or ``lookup_workflow_by_url()``,
+    it is bound to a catalog and its ``description`` and ``workflow_type`` properties become
+    writable. Setting these properties will update the catalog record. If the catalog is
+    read-only (a snapshot), attempting to set them will raise a ``DerivaMLException``.
     Attributes:
         name (str): Human-readable name of the workflow.
         url (str): URI to the workflow source code (typically a GitHub URL).
         workflow_type (str): Type of workflow (must be a controlled vocabulary term).
+            When the workflow is bound to a writable catalog, setting this property
+            will update the catalog record. The new value must be a valid term from
+            the Workflow_Type vocabulary.
         version (str | None): Version identifier (semantic versioning).
         description (str | None): Description of workflow purpose and behavior.
+            When the workflow is bound to a writable catalog, setting this property
+            will update the catalog record.
         rid (RID | None): Resource Identifier if registered in catalog.
         checksum (str | None): Git hash of workflow source code.
         is_notebook (bool): Whether workflow is a Jupyter notebook.
     Example:
-        >>> workflow = Workflow(
-        ...     name="RNA Analysis",
-        ...     url="https://github.com/org/repo/analysis.ipynb",
-        ...     workflow_type="python_notebook",
-        ...     version="1.0.0",
-        ...     description="RNA sequence analysis"
-        ... )
+        Create a workflow programmatically::
+            >>> workflow = Workflow(
+            ...     name="RNA Analysis",
+            ...     url="https://github.com/org/repo/analysis.ipynb",
+            ...     workflow_type="python_notebook",
+            ...     version="1.0.0",
+            ...     description="RNA sequence analysis"
+            ... )
+        Look up an existing workflow by RID and update its properties::
+            >>> workflow = ml.lookup_workflow("2-ABC1")
+            >>> workflow.description = "Updated description for RNA analysis"
+            >>> workflow.workflow_type = "python_script"
+            >>> print(workflow.description)
+            Updated description for RNA analysis
+        Look up by URL and update::
+            >>> url = "https://github.com/org/repo/blob/abc123/analysis.py"
+            >>> workflow = ml.lookup_workflow_by_url(url)
+            >>> workflow.description = "New description"
+        Attempting to update on a read-only catalog raises an error::
+            >>> snapshot_ml = ml.catalog_snapshot("2023-01-15T10:30:00")
+            >>> workflow = snapshot_ml.lookup_workflow("2-ABC1")
+            >>> workflow.description = "New description"  # Raises DerivaMLException
     """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     name: str
     workflow_type: str
     description: str | None = None
@@ -87,8 +126,119 @@ class Workflow(BaseModel):
     is_notebook: bool = False
     git_root: Path | None = None
+    _ml_instance: "DerivaMLCatalog | None" = PrivateAttr(default=None)
     _logger: logging.Logger = PrivateAttr(default=10)
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Override setattr to intercept description and workflow_type updates.
+        When the workflow is bound to a catalog (via lookup_workflow), setting
+        the ``description`` or ``workflow_type`` properties will update the catalog
+        record. If the catalog is read-only (a snapshot), a DerivaMLException is raised.
+        Args:
+            name: The attribute name being set.
+            value: The value to set.
+        Raises:
+            DerivaMLException: If attempting to set properties on a read-only
+                catalog (snapshot), or if workflow_type is not a valid vocabulary term.
+        Examples:
+            Update description::
+                >>> workflow = ml.lookup_workflow("2-ABC1")
+                >>> workflow.description = "Updated description"
+            Update workflow type::
+                >>> workflow = ml.lookup_workflow("2-ABC1")
+                >>> workflow.workflow_type = "python_notebook"
+        """
+        # Only intercept updates after full initialization
+        # Use __dict__ check to avoid recursion during Pydantic model construction
+        if (
+            "__pydantic_private__" in self.__dict__
+            and self.__dict__.get("__pydantic_private__", {}).get("_ml_instance") is not None
+        ):
+            if name == "description":
+                self._update_description_in_catalog(value)
+            elif name == "workflow_type":
+                self._update_workflow_type_in_catalog(value)
+        super().__setattr__(name, value)
+    def _check_writable_catalog(self, operation: str) -> None:
+        """Check that the catalog is writable and workflow is registered.
+        Args:
+            operation: Description of the operation being attempted.
+        Raises:
+            DerivaMLException: If the workflow is not registered (no RID),
+                or if the catalog is read-only (a snapshot).
+        """
+        # Import here to avoid circular dependency at module load
+        import importlib
+        _deriva_core = importlib.import_module("deriva.core")
+        ErmrestSnapshot = _deriva_core.ErmrestSnapshot
+        if self.rid is None:
+            raise DerivaMLException(
+                f"Cannot {operation}: Workflow is not registered in the catalog (no RID)"
+            )
+        if isinstance(self._ml_instance.catalog, ErmrestSnapshot):
+            raise DerivaMLException(
+                f"Cannot {operation} on a read-only catalog snapshot. "
+                "Use a writable catalog connection instead."
+            )
+    def _update_description_in_catalog(self, new_description: str | None) -> None:
+        """Update the description field in the catalog.
+        This internal method is called when the description property is set
+        on a catalog-bound Workflow object.
+        Args:
+            new_description: The new description value.
+        Raises:
+            DerivaMLException: If the workflow is not registered (no RID),
+                or if the catalog is read-only (a snapshot).
+        """
+        self._check_writable_catalog("update description")
+        # Update the catalog record
+        pb = self._ml_instance.pathBuilder()
+        workflow_path = pb.schemas[self._ml_instance.ml_schema].Workflow
+        workflow_path.update([{"RID": self.rid, "Description": new_description}])
+    def _update_workflow_type_in_catalog(self, new_workflow_type: str) -> None:
+        """Update the workflow_type field in the catalog.
+        This internal method is called when the workflow_type property is set
+        on a catalog-bound Workflow object. The new workflow type must be a valid
+        term from the Workflow_Type vocabulary.
+        Args:
+            new_workflow_type: The new workflow type (must be a valid vocabulary term).
+        Raises:
+            DerivaMLException: If the workflow is not registered (no RID),
+                the catalog is read-only (a snapshot), or the workflow_type
+                is not a valid vocabulary term.
+        """
+        self._check_writable_catalog("update workflow_type")
+        # Validate that the new workflow type exists in vocabulary
+        from deriva_ml.core.definitions import MLVocab
+        self._ml_instance.lookup_term(MLVocab.workflow_type, new_workflow_type)
+        # Update the catalog record
+        pb = self._ml_instance.pathBuilder()
+        workflow_path = pb.schemas[self._ml_instance.ml_schema].Workflow
+        workflow_path.update([{"RID": self.rid, "Workflow_Type": new_workflow_type}])
     @model_validator(mode="after")
     def setup_url_checksum(self) -> "Workflow":
         """Creates a workflow from the current execution context.
@@ -100,6 +250,13 @@ class Workflow(BaseModel):
         The behavior can be configured using environment variables:
             - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
             - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
+            - DERIVAML_MCP_IN_DOCKER: Set to "true" to use Docker metadata instead of git
+        Docker environment variables (used when DERIVAML_MCP_IN_DOCKER=true):
+            - DERIVAML_MCP_VERSION: Semantic version of the Docker image
+            - DERIVAML_MCP_GIT_COMMIT: Git commit hash at build time
+            - DERIVAML_MCP_IMAGE_DIGEST: Docker image digest (unique identifier)
+            - DERIVAML_MCP_IMAGE_NAME: Docker image name (e.g., ghcr.io/org/repo)
         Args:
@@ -107,7 +264,7 @@ class Workflow(BaseModel):
             Workflow: New workflow instance with detected Git information.
         Raises:
-            DerivaMLException: If not in a Git repository or detection fails.
+            DerivaMLException: If not in a Git repository or detection fails (non-Docker).
         Example:
             >>> workflow = Workflow.create_workflow(
@@ -116,22 +273,55 @@ class Workflow(BaseModel):
             ...     description="Process sample data"
             ... )
         """
-        """Initializes logging for the workflow."""
+        self._logger = logging.getLogger("deriva_ml")
-        # Check to see if execution file info is being passed in by calling program.
+        # Check if running in Docker container (no git repo available)
+        if os.environ.get("DERIVAML_MCP_IN_DOCKER", "").lower() == "true":
+            # Use Docker image metadata for provenance
+            self.version = self.version or os.environ.get("DERIVAML_MCP_VERSION", "")
+            # Use image digest as checksum (unique identifier for the container)
+            # Fall back to git commit if digest not available
+            self.checksum = self.checksum or (
+                os.environ.get("DERIVAML_MCP_IMAGE_DIGEST", "")
+                or os.environ.get("DERIVAML_MCP_GIT_COMMIT", "")
+            )
+            # Build URL pointing to the Docker image or source repo
+            if not self.url:
+                image_name = os.environ.get(
+                    "DERIVAML_MCP_IMAGE_NAME",
+                    "ghcr.io/informatics-isi-edu/deriva-ml-mcp",
+                )
+                image_digest = os.environ.get("DERIVAML_MCP_IMAGE_DIGEST", "")
+                if image_digest:
+                    # URL format: image@sha256:digest
+                    self.url = f"{image_name}@{image_digest}"
+                else:
+                    # Fall back to source repo with git commit
+                    source_url = "https://github.com/informatics-isi-edu/deriva-ml-mcp"
+                    git_commit = os.environ.get("DERIVAML_MCP_GIT_COMMIT", "")
+                    self.url = f"{source_url}/commit/{git_commit}" if git_commit else source_url
+            return self
+        # Check to see if execution file info is being passed in by calling program (notebook runner)
         if "DERIVA_ML_WORKFLOW_URL" in os.environ:
             self.url = os.environ["DERIVA_ML_WORKFLOW_URL"]
-            self.checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
-            self.git_root = Workflow._get_git_root(Path(os.environ["DERIVA_ML_NOTEBOOK_PATH"]))
+            self.checksum = os.environ.get("DERIVA_ML_WORKFLOW_CHECKSUM", "")
+            notebook_path = os.environ.get("DERIVA_ML_NOTEBOOK_PATH")
+            if notebook_path:
+                self.git_root = Workflow._get_git_root(Path(notebook_path))
             self.is_notebook = True
+            return self
+        # Standard git detection for local development
         if not self.url:
             path, self.is_notebook = Workflow._get_python_script()
             self.url, self.checksum = Workflow.get_url_and_checksum(path)
             self.git_root = Workflow._get_git_root(path)
         self.version = self.version or Workflow.get_dynamic_version(root=str(self.git_root or Path.cwd()))
-        self._logger = logging.getLogger("deriva_ml")
         return self
     @staticmethod
@@ -260,7 +450,21 @@ class Workflow(BaseModel):
         except RuntimeError:
             return None, None
-        kernel_id = connection_file.split("-", 1)[1].split(".")[0]
+        # Extract kernel ID from connection filename.
+        # Standard Jupyter format: "kernel-<kernel_id>.json"
+        # PyCharm/other formats may vary: "<kernel_id>.json" or other patterns
+        kernel_id = None
+        if connection_file.startswith("kernel-") and "-" in connection_file:
+            # Standard format: kernel-<uuid>.json
+            parts = connection_file.split("-", 1)
+            if len(parts) > 1:
+                kernel_id = parts[1].rsplit(".", 1)[0]
+        else:
+            # Fallback: assume filename (without extension) is the kernel ID
+            kernel_id = connection_file.rsplit(".", 1)[0]
+        if not kernel_id:
+            return None, None
         # Look through the running server sessions to find the matching kernel ID
         for server in get_servers():
@@ -310,23 +514,8 @@ class Workflow(BaseModel):
     @staticmethod
     def _get_python_script() -> tuple[Path, bool]:
         """Return the path to the currently executing script"""
-        is_notebook = True
-        if not (filename := Workflow._get_notebook_path()):
-            is_notebook = False
-            stack = [
-                s.filename
-                for s in inspect.stack()
-                if ("pycharm" not in s.filename) and ("site-packages" not in s.filename)
-            ]
-            # Get the caller's filename, which is two up the stack from here.
-            filename = Path(stack[-1])
-            if not (filename.exists()) or Workflow._in_repl():
-                # Being called from the command line interpreter.
-                filename = Path.cwd() / Path("REPL")
-            # Get the caller's filename, which is two up the stack from here.
-            elif (not filename.exists()) and "PYTEST_CURRENT_TEST" in os.environ:
-                filename = Path.cwd() / Path("pytest")
-        return filename, is_notebook
+        is_notebook = Workflow._get_notebook_path() is not None
+        return Path(_get_calling_module()), is_notebook
     @staticmethod
     def _github_url(executable_path: Path) -> tuple[str, bool]:

deriva_ml/experiment/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+"""Experiment analysis for DerivaML.
+This module provides the Experiment class for analyzing completed executions.
+"""
+from deriva_ml.experiment.experiment import Experiment
+__all__ = ["Experiment"]

deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

deriva-ml 1.17.10-py3-none-any.whl → 1.17.11-py3-none-any.whl