deriva-ml 1.17.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. deriva_ml/.DS_Store +0 -0
  2. deriva_ml/__init__.py +79 -0
  3. deriva_ml/bump_version.py +142 -0
  4. deriva_ml/core/__init__.py +39 -0
  5. deriva_ml/core/base.py +1527 -0
  6. deriva_ml/core/config.py +69 -0
  7. deriva_ml/core/constants.py +36 -0
  8. deriva_ml/core/definitions.py +74 -0
  9. deriva_ml/core/enums.py +222 -0
  10. deriva_ml/core/ermrest.py +288 -0
  11. deriva_ml/core/exceptions.py +28 -0
  12. deriva_ml/core/filespec.py +116 -0
  13. deriva_ml/dataset/__init__.py +12 -0
  14. deriva_ml/dataset/aux_classes.py +225 -0
  15. deriva_ml/dataset/dataset.py +1519 -0
  16. deriva_ml/dataset/dataset_bag.py +450 -0
  17. deriva_ml/dataset/history.py +109 -0
  18. deriva_ml/dataset/upload.py +439 -0
  19. deriva_ml/demo_catalog.py +495 -0
  20. deriva_ml/execution/__init__.py +26 -0
  21. deriva_ml/execution/environment.py +290 -0
  22. deriva_ml/execution/execution.py +1180 -0
  23. deriva_ml/execution/execution_configuration.py +147 -0
  24. deriva_ml/execution/workflow.py +413 -0
  25. deriva_ml/feature.py +228 -0
  26. deriva_ml/install_kernel.py +71 -0
  27. deriva_ml/model/__init__.py +0 -0
  28. deriva_ml/model/catalog.py +485 -0
  29. deriva_ml/model/database.py +719 -0
  30. deriva_ml/protocols/dataset.py +19 -0
  31. deriva_ml/run_notebook.py +228 -0
  32. deriva_ml/schema/__init__.py +3 -0
  33. deriva_ml/schema/annotations.py +473 -0
  34. deriva_ml/schema/check_schema.py +104 -0
  35. deriva_ml/schema/create_schema.py +393 -0
  36. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  37. deriva_ml/schema/policy.json +81 -0
  38. deriva_ml/schema/table_comments_utils.py +57 -0
  39. deriva_ml/test.py +94 -0
  40. deriva_ml-1.17.10.dist-info/METADATA +38 -0
  41. deriva_ml-1.17.10.dist-info/RECORD +45 -0
  42. deriva_ml-1.17.10.dist-info/WHEEL +5 -0
  43. deriva_ml-1.17.10.dist-info/entry_points.txt +9 -0
  44. deriva_ml-1.17.10.dist-info/licenses/LICENSE +201 -0
  45. deriva_ml-1.17.10.dist-info/top_level.txt +1 -0
@@ -0,0 +1,147 @@
1
+ """Configuration management for DerivaML executions.
2
+
3
+ This module provides functionality for configuring and managing execution parameters in DerivaML.
4
+ It includes:
5
+
6
+ - ExecutionConfiguration class: Core class for execution settings
7
+ - Parameter validation: Handles JSON and file-based parameters
8
+ - Dataset specifications: Manages dataset versions and materialization
9
+ - Asset management: Tracks required input files
10
+
11
+ The module supports both direct parameter specification and JSON-based configuration files.
12
+
13
+ Typical usage example:
14
+ >>> config = ExecutionConfiguration(
15
+ ... workflow="analysis_workflow",
16
+ ... datasets=[DatasetSpec(rid="1-abc123", version="1.0.0")],
17
+ ... parameters={"threshold": 0.5},
18
+ ... description="Process sample data"
19
+ ... )
20
+ >>> execution = ml.create_execution(config)
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import sys
27
+ from dataclasses import dataclass
28
+ from pathlib import Path
29
+ from typing import Any
30
+
31
+ from hydra_zen import builds
32
+ from omegaconf import DictConfig
33
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
34
+
35
+ from deriva_ml.core.definitions import RID
36
+ from deriva_ml.dataset.aux_classes import DatasetSpec
37
+ from deriva_ml.execution.workflow import Workflow
38
+
39
+
40
class ExecutionConfiguration(BaseModel):
    """Configuration for a DerivaML execution.

    Defines the complete configuration for a computational or manual process in
    DerivaML, including required datasets, input assets, and the workflow
    definition.

    Attributes:
        datasets (list[DatasetSpec]): Dataset specifications, each containing:
            - rid: Dataset Resource Identifier
            - version: Version to use
            - materialize: Whether to extract dataset contents
        assets (list[RID]): Resource Identifiers of required input assets.
        workflow (RID | Workflow | None): Workflow definition or its Resource
            Identifier.
        description (str): Description of execution purpose (supports Markdown).
        argv (list[str]): Command line arguments used to start the execution.

    Example:
        >>> config = ExecutionConfiguration(
        ...     workflow=Workflow.create_workflow("analysis", "python_script"),
        ...     datasets=[
        ...         DatasetSpec(rid="1-abc123", version="1.0.0", materialize=True)
        ...     ],
        ...     description="Process RNA sequence data"
        ... )
    """

    datasets: list[DatasetSpec] = []
    assets: list[RID] = []
    workflow: RID | Workflow | None = None
    description: str = ""
    # Capture the command line of the invoking process by default.
    argv: list[str] = Field(default_factory=lambda: sys.argv)

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @field_validator("assets", mode="before")
    @classmethod
    def validate_assets(cls, value: Any) -> Any:
        """Normalize asset entries to plain RID strings.

        Entries may arrive as omegaconf DictConfig nodes or AssetRID instances;
        both carry the identifier in a ``rid`` attribute.
        """
        return [v.rid if isinstance(v, (DictConfig, AssetRID)) else v for v in value]

    @staticmethod
    def load_configuration(path: Path) -> ExecutionConfiguration:
        """Creates an ExecutionConfiguration from a JSON file.

        Loads and parses a JSON configuration file into an ExecutionConfiguration
        instance. The file should contain a valid configuration specification.

        Args:
            path: Path to JSON configuration file.

        Returns:
            ExecutionConfiguration: Loaded configuration instance.

        Raises:
            ValueError: If JSON file is invalid or missing required fields.
            FileNotFoundError: If configuration file doesn't exist.

        Example:
            >>> config = ExecutionConfiguration.load_configuration(Path("config.json"))
            >>> print(f"Workflow: {config.workflow}")
            >>> print(f"Datasets: {len(config.datasets)}")
        """
        with Path(path).open() as fd:
            config = json.load(fd)
        return ExecutionConfiguration.model_validate(config)
135
+
136
@dataclass(eq=False)
class AssetRID(str):
    """A RID string that carries an optional human-readable description.

    Subclasses ``str`` so an AssetRID can be used anywhere a plain RID string is
    expected, while exposing ``rid`` and ``description`` attributes.

    ``eq=False`` is required: the default dataclass-generated ``__eq__`` would
    compare ``(rid, description)`` and set ``__hash__ = None``, making instances
    of this ``str`` subclass unhashable (unusable as dict keys / set members)
    and breaking value equality between two AssetRIDs with differing
    descriptions. With ``eq=False`` the ``str`` equality and hash are inherited.
    """

    rid: str  # redundant copy of the string value, kept for attribute access
    description: str = ""

    def __new__(cls, rid: str, description: str = ""):
        # str is immutable, so the string value must be established in __new__;
        # the dataclass-generated __init__ then (re)assigns the attributes.
        obj = super().__new__(cls, rid)
        obj.description = description
        return obj
145
+
146
+
147
# Hydra-zen structured config for AssetRID; populate_full_signature exposes
# both the `rid` and `description` constructor parameters as config fields.
AssetRIDConfig = builds(AssetRID, populate_full_signature=True)
@@ -0,0 +1,413 @@
1
+ import inspect
2
+ import logging
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import warnings
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import requests
11
+ from pydantic import BaseModel, PrivateAttr, model_validator
12
+ from requests import RequestException
13
+
14
+ from deriva_ml.core.definitions import RID
15
+ from deriva_ml.core.exceptions import DerivaMLException
16
+
17
# Optional IPython dependency: used to detect whether we are running inside an
# interactive IPython/Jupyter session.
try:
    from IPython.core.getipython import get_ipython
except ImportError:  # Graceful fallback if IPython isn't installed.

    def get_ipython() -> None:
        return None


# Optional jupyter_server dependency: used to enumerate running notebook
# servers when resolving the path of the currently executing notebook.
try:
    from jupyter_server.serverapp import list_running_servers

    def get_servers() -> list[Any]:
        return list(list_running_servers())
except ImportError:

    def list_running_servers():
        return []

    def get_servers() -> list[Any]:
        return list_running_servers()


# Optional ipykernel dependency: the kernel connection file name encodes the
# kernel ID, which identifies the notebook session.
try:
    from ipykernel.connect import get_connection_file

    def get_kernel_connection() -> str:
        return get_connection_file()
except ImportError:

    def get_connection_file():
        return ""

    def get_kernel_connection() -> str:
        return get_connection_file()
51
+
52
+
53
class Workflow(BaseModel):
    """Represents a computational workflow in DerivaML.

    A workflow defines a computational process or analysis pipeline. Each workflow has
    a unique identifier, source code location, and type. Workflows are typically
    associated with Git repositories for version control.

    Attributes:
        name (str): Human-readable name of the workflow.
        workflow_type (str): Type of workflow (must be a controlled vocabulary term).
        description (str | None): Description of workflow purpose and behavior.
        url (str | None): URI to the workflow source code (typically a GitHub URL).
        version (str | None): Version identifier (semantic versioning).
        rid (RID | None): Resource Identifier if registered in catalog.
        checksum (str | None): Git hash of workflow source code.
        is_notebook (bool): Whether workflow is a Jupyter notebook.
        git_root (Path | None): Root directory of the enclosing Git repository.

    Example:
        >>> workflow = Workflow(
        ...     name="RNA Analysis",
        ...     url="https://github.com/org/repo/analysis.ipynb",
        ...     workflow_type="python_notebook",
        ...     version="1.0.0",
        ...     description="RNA sequence analysis"
        ... )
    """

    name: str
    workflow_type: str
    description: str | None = None
    url: str | None = None
    version: str | None = None
    rid: RID | None = None
    checksum: str | None = None
    is_notebook: bool = False
    git_root: Path | None = None

    # BUG FIX: the previous default was PrivateAttr(default=10) — the int 10 is
    # not a Logger, so any logging call made before the model validator ran
    # would fail. Default to the package logger instead.
    _logger: logging.Logger = PrivateAttr(default_factory=lambda: logging.getLogger("deriva_ml"))
91
+
92
+ @model_validator(mode="after")
93
+ def setup_url_checksum(self) -> "Workflow":
94
+ """Creates a workflow from the current execution context.
95
+
96
+ Identifies the currently executing program (script or notebook) and creates
97
+ a workflow definition. Automatically determines the Git repository information
98
+ and source code checksum.
99
+
100
+ The behavior can be configured using environment variables:
101
+ - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
102
+ - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
103
+
104
+ Args:
105
+
106
+ Returns:
107
+ Workflow: New workflow instance with detected Git information.
108
+
109
+ Raises:
110
+ DerivaMLException: If not in a Git repository or detection fails.
111
+
112
+ Example:
113
+ >>> workflow = Workflow.create_workflow(
114
+ ... name="Sample Analysis",
115
+ ... workflow_type="python_script",
116
+ ... description="Process sample data"
117
+ ... )
118
+ """
119
+ """Initializes logging for the workflow."""
120
+
121
+ # Check to see if execution file info is being passed in by calling program.
122
+ if "DERIVA_ML_WORKFLOW_URL" in os.environ:
123
+ self.url = os.environ["DERIVA_ML_WORKFLOW_URL"]
124
+ self.checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
125
+ self.git_root = Workflow._get_git_root(Path(os.environ["DERIVA_ML_NOTEBOOK_PATH"]))
126
+ self.is_notebook = True
127
+
128
+ if not self.url:
129
+ path, self.is_notebook = Workflow._get_python_script()
130
+ self.url, self.checksum = Workflow.get_url_and_checksum(path)
131
+ self.git_root = Workflow._get_git_root(path)
132
+
133
+ self.version = self.version or Workflow.get_dynamic_version(root=str(self.git_root or Path.cwd()))
134
+ self._logger = logging.getLogger("deriva_ml")
135
+ return self
136
+
137
+ @staticmethod
138
+ def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
139
+ """Determines the Git URL and checksum for a file.
140
+
141
+ Computes the Git repository URL and file checksum for the specified path.
142
+ For notebooks, strips cell outputs before computing the checksum.
143
+
144
+ Args:
145
+ executable_path: Path to the workflow file.
146
+
147
+ Returns:
148
+ tuple[str, str]: (GitHub URL, Git object hash)
149
+
150
+ Raises:
151
+ DerivaMLException: If not in a Git repository.
152
+
153
+ Example:
154
+ >>> url, checksum = Workflow.get_url_and_checksum(Path("analysis.ipynb"))
155
+ >>> print(f"URL: {url}")
156
+ >>> print(f"Checksum: {checksum}")
157
+ """
158
+ try:
159
+ subprocess.run(
160
+ "git rev-parse --is-inside-work-tree",
161
+ capture_output=True,
162
+ text=True,
163
+ shell=True,
164
+ check=True,
165
+ )
166
+ except subprocess.CalledProcessError:
167
+ raise DerivaMLException("Not executing in a Git repository.")
168
+
169
+ github_url, is_dirty = Workflow._github_url(executable_path)
170
+
171
+ if is_dirty:
172
+ logging.getLogger("deriva_ml").warning(
173
+ f"File {executable_path} has been modified since last commit. Consider commiting before executing"
174
+ )
175
+
176
+ # If you are in a notebook, strip out the outputs before computing the checksum.
177
+ cmd = (
178
+ f"nbstripout -t {executable_path} | git hash-object --stdin"
179
+ if "ipynb" == executable_path.suffix
180
+ else f"git hash-object {executable_path}"
181
+ )
182
+ checksum = (
183
+ subprocess.run(
184
+ cmd,
185
+ capture_output=True,
186
+ text=True,
187
+ check=False,
188
+ shell=True,
189
+ ).stdout.strip()
190
+ if executable_path != "REPL"
191
+ else "1"
192
+ )
193
+ return github_url, checksum
194
+
195
+ @staticmethod
196
+ def _get_git_root(executable_path: Path) -> str | None:
197
+ """Gets the root directory of the Git repository.
198
+
199
+ Args:
200
+ executable_path: Path to check for Git repository.
201
+
202
+ Returns:
203
+ str | None: Absolute path to repository root, or None if not in repository.
204
+ """
205
+ try:
206
+ result = subprocess.run(
207
+ ["git", "rev-parse", "--show-toplevel"],
208
+ cwd=executable_path.parent,
209
+ stdout=subprocess.PIPE,
210
+ stderr=subprocess.DEVNULL,
211
+ text=True,
212
+ check=True,
213
+ )
214
+ return result.stdout.strip()
215
+ except subprocess.CalledProcessError:
216
+ return None # Not in a git repository
217
+
218
+ @staticmethod
219
+ def _check_nbstrip_status() -> None:
220
+ """Checks if nbstripout is installed and configured.
221
+
222
+ Verifies that the nbstripout tool is available and properly installed in the
223
+ Git repository. Issues warnings if setup is incomplete.
224
+ """
225
+ logger = logging.getLogger("deriva_ml")
226
+ try:
227
+ if subprocess.run(
228
+ ["nbstripout", "--is-installed"],
229
+ check=False,
230
+ capture_output=True,
231
+ ).returncode:
232
+ logger.warning("nbstripout is not installed in repository. Please run nbstripout --install")
233
+ except subprocess.CalledProcessError:
234
+ logger.error("nbstripout is not found.")
235
+
236
+ @staticmethod
237
+ def _get_notebook_path() -> Path | None:
238
+ """Gets the path of the currently executing notebook.
239
+
240
+ Returns:
241
+ Path | None: Absolute path to current notebook, or None if not in notebook.
242
+ """
243
+
244
+ server, session = Workflow._get_notebook_session()
245
+
246
+ if server and session:
247
+ relative_path = session["notebook"]["path"]
248
+ # Join the notebook directory with the relative path
249
+ return Path(server["root_dir"]) / relative_path
250
+ else:
251
+ return None
252
+
253
+ @staticmethod
254
+ def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
255
+ """Return the absolute path of the current notebook."""
256
+ # Get the kernel's connection file and extract the kernel ID
257
+ try:
258
+ if not (connection_file := Path(get_kernel_connection()).name):
259
+ return None, None
260
+ except RuntimeError:
261
+ return None, None
262
+
263
+ kernel_id = connection_file.split("-", 1)[1].split(".")[0]
264
+
265
+ # Look through the running server sessions to find the matching kernel ID
266
+ for server in get_servers():
267
+ try:
268
+ # If a token is required for authentication, include it in headers
269
+ token = server.get("token", "")
270
+ headers = {}
271
+ if token:
272
+ headers["Authorization"] = f"token {token}"
273
+
274
+ try:
275
+ sessions_url = server["url"] + "api/sessions"
276
+ response = requests.get(sessions_url, headers=headers)
277
+ response.raise_for_status()
278
+ sessions = response.json()
279
+ except RequestException as e:
280
+ raise e
281
+ for sess in sessions:
282
+ if sess["kernel"]["id"] == kernel_id:
283
+ return server, sess
284
+ except Exception as _e:
285
+ # Ignore servers we can't connect to.
286
+ pass
287
+ return None, None
288
+
289
+ @staticmethod
290
+ def _in_repl():
291
+ # Standard Python interactive mode
292
+ if hasattr(sys, "ps1"):
293
+ return True
294
+
295
+ # Interactive mode forced by -i
296
+ if sys.flags.interactive:
297
+ return True
298
+
299
+ # IPython / Jupyter detection
300
+ try:
301
+ from IPython import get_ipython
302
+
303
+ if get_ipython() is not None:
304
+ return True
305
+ except ImportError:
306
+ pass
307
+
308
+ return False
309
+
310
+ @staticmethod
311
+ def _get_python_script() -> tuple[Path, bool]:
312
+ """Return the path to the currently executing script"""
313
+ is_notebook = True
314
+ if not (filename := Workflow._get_notebook_path()):
315
+ is_notebook = False
316
+ stack = [
317
+ s.filename
318
+ for s in inspect.stack()
319
+ if ("pycharm" not in s.filename) and ("site-packages" not in s.filename)
320
+ ]
321
+ # Get the caller's filename, which is two up the stack from here.
322
+ filename = Path(stack[-1])
323
+ if not (filename.exists()) or Workflow._in_repl():
324
+ # Being called from the command line interpreter.
325
+ filename = Path.cwd() / Path("REPL")
326
+ # Get the caller's filename, which is two up the stack from here.
327
+ elif (not filename.exists()) and "PYTEST_CURRENT_TEST" in os.environ:
328
+ filename = Path.cwd() / Path("pytest")
329
+ return filename, is_notebook
330
+
331
+ @staticmethod
332
+ def _github_url(executable_path: Path) -> tuple[str, bool]:
333
+ """Return a GitHub URL for the latest commit of the script from which this routine is called.
334
+
335
+ This routine is used to be called from a script or notebook (e.g., python -m file). It assumes that
336
+ the file is in a GitHub repository and committed. It returns a URL to the last commited version of this
337
+ file in GitHub.
338
+
339
+ Returns: A tuple with the gethub_url and a boolean to indicate if uncommited changes
340
+ have been made to the file.
341
+
342
+ """
343
+
344
+ # Get repo URL from local GitHub repo.
345
+ if executable_path == "REPL":
346
+ return "REPL", True
347
+ try:
348
+ result = subprocess.run(
349
+ ["git", "remote", "get-url", "origin"],
350
+ capture_output=True,
351
+ text=True,
352
+ cwd=executable_path.parent,
353
+ )
354
+ github_url = result.stdout.strip().removesuffix(".git")
355
+ except subprocess.CalledProcessError:
356
+ raise DerivaMLException("No GIT remote found")
357
+
358
+ # Find the root directory for the repository
359
+ repo_root = Workflow._get_git_root(executable_path)
360
+
361
+ # Now check to see if a file has been modified since the last commit.
362
+ try:
363
+ result = subprocess.run(
364
+ ["git", "status", "--porcelain"],
365
+ cwd=executable_path.parent,
366
+ capture_output=True,
367
+ text=True,
368
+ check=False,
369
+ )
370
+ is_dirty = bool("M " in result.stdout.strip()) # Returns True if the output indicates a modified file
371
+ except subprocess.CalledProcessError:
372
+ is_dirty = False # If the Git command fails, assume no changes
373
+
374
+ """Get SHA-1 hash of latest commit of the file in the repository"""
375
+
376
+ result = subprocess.run(
377
+ ["git", "log", "-n", "1", "--pretty=format:%H", executable_path],
378
+ cwd=repo_root,
379
+ capture_output=True,
380
+ text=True,
381
+ check=False,
382
+ )
383
+ sha = result.stdout.strip()
384
+ url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
385
+ return url, is_dirty
386
+
387
+ @staticmethod
388
+ def get_dynamic_version(root: str | os.PathLike | None = None) -> str:
389
+ """
390
+ Return a dynamic version string based on VCS state (setuptools_scm),
391
+ including dirty/uncommitted changes if configured.
392
+
393
+ Works under uv / Python 3.10+ by forcing setuptools to use stdlib distutils.
394
+ """
395
+ # 1) Tell setuptools to use stdlib distutils (or no override) to avoid
396
+ # the '_distutils_hack' assertion you hit.
397
+ os.environ.setdefault("SETUPTOOLS_USE_DISTUTILS", "stdlib")
398
+
399
+ warnings.filterwarnings(
400
+ "ignore",
401
+ category=UserWarning,
402
+ module="_distutils_hack",
403
+ )
404
+ try:
405
+ from setuptools_scm import get_version
406
+ except Exception as e: # ImportError or anything environment-specific
407
+ raise RuntimeError(f"setuptools_scm is not available: {e}") from e
408
+
409
+ if root is None:
410
+ # Adjust this to point at your repo root if needed
411
+ root = Path(__file__).resolve().parents[1]
412
+
413
+ return get_version(root=root)