deriva-ml 1.14.0__py3-none-any.whl → 1.14.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. deriva_ml/__init__.py +25 -30
  2. deriva_ml/core/__init__.py +39 -0
  3. deriva_ml/core/base.py +1489 -0
  4. deriva_ml/core/constants.py +36 -0
  5. deriva_ml/core/definitions.py +74 -0
  6. deriva_ml/core/enums.py +222 -0
  7. deriva_ml/core/ermrest.py +288 -0
  8. deriva_ml/core/exceptions.py +28 -0
  9. deriva_ml/core/filespec.py +116 -0
  10. deriva_ml/dataset/__init__.py +4 -0
  11. deriva_ml/{dataset_aux_classes.py → dataset/aux_classes.py} +16 -12
  12. deriva_ml/{dataset.py → dataset/dataset.py} +405 -428
  13. deriva_ml/{dataset_bag.py → dataset/dataset_bag.py} +137 -97
  14. deriva_ml/{history.py → dataset/history.py} +51 -33
  15. deriva_ml/{upload.py → dataset/upload.py} +48 -70
  16. deriva_ml/demo_catalog.py +233 -183
  17. deriva_ml/execution/environment.py +290 -0
  18. deriva_ml/{execution.py → execution/execution.py} +365 -252
  19. deriva_ml/execution/execution_configuration.py +163 -0
  20. deriva_ml/{execution_configuration.py → execution/workflow.py} +206 -218
  21. deriva_ml/feature.py +83 -46
  22. deriva_ml/model/__init__.py +0 -0
  23. deriva_ml/{deriva_model.py → model/catalog.py} +113 -132
  24. deriva_ml/{database_model.py → model/database.py} +52 -74
  25. deriva_ml/model/sql_mapper.py +44 -0
  26. deriva_ml/run_notebook.py +19 -11
  27. deriva_ml/schema/__init__.py +3 -0
  28. deriva_ml/{schema_setup → schema}/annotations.py +31 -22
  29. deriva_ml/schema/check_schema.py +104 -0
  30. deriva_ml/{schema_setup → schema}/create_schema.py +151 -104
  31. deriva_ml/schema/deriva-ml-reference.json +8525 -0
  32. deriva_ml/schema/table_comments_utils.py +57 -0
  33. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/METADATA +5 -4
  34. deriva_ml-1.14.26.dist-info/RECORD +40 -0
  35. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/entry_points.txt +1 -0
  36. deriva_ml/deriva_definitions.py +0 -391
  37. deriva_ml/deriva_ml_base.py +0 -1046
  38. deriva_ml/execution_environment.py +0 -139
  39. deriva_ml/schema_setup/table_comments_utils.py +0 -56
  40. deriva_ml/test-files/execution-parameters.json +0 -1
  41. deriva_ml/test-files/notebook-parameters.json +0 -5
  42. deriva_ml/test_functions.py +0 -141
  43. deriva_ml/test_notebook.ipynb +0 -197
  44. deriva_ml-1.14.0.dist-info/RECORD +0 -31
  45. /deriva_ml/{schema_setup → execution}/__init__.py +0 -0
  46. /deriva_ml/{schema_setup → schema}/policy.json +0 -0
  47. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/WHEEL +0 -0
  48. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/licenses/LICENSE +0 -0
  49. {deriva_ml-1.14.0.dist-info → deriva_ml-1.14.26.dist-info}/top_level.txt +0 -0
@@ -1,91 +1,239 @@
1
- """
2
- Classes that are used to define an execution configuration.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
1
  import inspect
8
- import json
9
2
  import logging
10
3
  import os
11
-
12
- from requests import RequestException
13
- import requests
14
4
  import subprocess
15
- from typing import Optional, Any
5
+ from pathlib import Path
6
+ from typing import Any
16
7
 
8
+ import requests
17
9
  from pydantic import (
18
10
  BaseModel,
19
- conlist,
20
- ConfigDict,
21
- field_validator,
22
- Field,
23
11
  PrivateAttr,
24
12
  )
25
- from pathlib import Path
26
- import sys
27
-
13
+ from requests import RequestException
28
14
 
29
- from .dataset_aux_classes import DatasetSpec
30
- from .deriva_definitions import RID, DerivaMLException
15
+ from deriva_ml.core.definitions import RID
16
+ from deriva_ml.core.exceptions import DerivaMLException
31
17
 
32
18
  try:
33
- from IPython import get_ipython
19
+ from IPython.core.getipython import get_ipython
34
20
  except ImportError: # Graceful fallback if IPython isn't installed.
35
21
 
36
- def get_ipython():
37
- """Dummy routine in case you are not running in IPython."""
22
+ def get_ipython() -> None:
38
23
  return None
39
24
 
40
25
 
41
26
  try:
42
27
  from jupyter_server.serverapp import list_running_servers
28
+
29
+ def get_servers() -> list[Any]:
30
+ return list(list_running_servers())
43
31
  except ImportError:
44
32
 
45
33
  def list_running_servers():
46
- """Dummy routine in case you are not running in Jupyter."""
47
34
  return []
48
35
 
36
+ def get_servers() -> list[Any]:
37
+ return list_running_servers()
38
+
49
39
 
50
40
  try:
51
- from ipykernel import get_connection_file
41
+ from ipykernel.connect import get_connection_file
42
+
43
+ def get_kernel_connection() -> str:
44
+ return get_connection_file()
52
45
  except ImportError:
53
46
 
54
47
  def get_connection_file():
55
- """Dummy routine in case you are not running in Jupyter."""
56
48
  return ""
57
49
 
50
+ def get_kernel_connection() -> str:
51
+ return get_connection_file()
52
+
58
53
 
59
54
  class Workflow(BaseModel):
60
- """A specification of a workflow. Must have a name, URI to the workflow instance, and a type. The workflow type
61
- needs to be an existing-controlled vocabulary term.
55
+ """Represents a computational workflow in DerivaML.
56
+
57
+ A workflow defines a computational process or analysis pipeline. Each workflow has
58
+ a unique identifier, source code location, and type. Workflows are typically
59
+ associated with Git repositories for version control.
62
60
 
63
61
  Attributes:
64
- name: The name of the workflow
65
- url: The URI to the workflow instance. In most cases should be a GitHub URI to the code being executed.
66
- workflow_type: The type of the workflow. Must be an existing controlled vocabulary term.
67
- version: The version of the workflow instance. Should follow semantic versioning.
68
- description: A description of the workflow instance. Can be in Markdown format.
69
- is_notebook: A boolean indicating whether this workflow instance is a notebook or not.
62
+ name (str): Human-readable name of the workflow.
63
+ url (str): URI to the workflow source code (typically a GitHub URL).
64
+ workflow_type (str): Type of workflow (must be a controlled vocabulary term).
65
+ version (str | None): Version identifier (semantic versioning).
66
+ description (str | None): Description of workflow purpose and behavior.
67
+ rid (RID | None): Resource Identifier if registered in catalog.
68
+ checksum (str | None): Git hash of workflow source code.
69
+ is_notebook (bool): Whether workflow is a Jupyter notebook.
70
+
71
+ Example:
72
+ >>> workflow = Workflow(
73
+ ... name="RNA Analysis",
74
+ ... url="https://github.com/org/repo/analysis.ipynb",
75
+ ... workflow_type="python_notebook",
76
+ ... version="1.0.0",
77
+ ... description="RNA sequence analysis"
78
+ ... )
70
79
  """
71
80
 
72
81
  name: str
73
82
  url: str
74
83
  workflow_type: str
75
- version: Optional[str] = None
76
- description: str = None
77
- rid: Optional[RID] = None
78
- checksum: Optional[str] = None
84
+ version: str | None = None
85
+ description: str | None = None
86
+ rid: RID | None = None
87
+ checksum: str | None = None
79
88
  is_notebook: bool = False
80
89
 
81
90
  _logger: Any = PrivateAttr()
82
91
 
83
92
  def __post_init__(self):
93
+ """Initializes logging for the workflow."""
84
94
  self._logger = logging.getLogger("deriva_ml")
85
95
 
96
+ @staticmethod
97
+ def create_workflow(
98
+ name: str,
99
+ workflow_type: str,
100
+ description: str = "",
101
+ ) -> "Workflow":
102
+ """Creates a workflow from the current execution context.
103
+
104
+ Identifies the currently executing program (script or notebook) and creates
105
+ a workflow definition. Automatically determines the Git repository information
106
+ and source code checksum.
107
+
108
+ The behavior can be configured using environment variables:
109
+ - DERIVA_ML_WORKFLOW_URL: Override the detected workflow URL
110
+ - DERIVA_ML_WORKFLOW_CHECKSUM: Override the computed checksum
111
+
112
+ Args:
113
+ name: Human-readable name for the workflow.
114
+ workflow_type: Type of workflow (must be a vocabulary term).
115
+ description: Optional description of workflow purpose.
116
+
117
+ Returns:
118
+ Workflow: New workflow instance with detected Git information.
119
+
120
+ Raises:
121
+ DerivaMLException: If not in a Git repository or detection fails.
122
+
123
+ Example:
124
+ >>> workflow = Workflow.create_workflow(
125
+ ... name="Sample Analysis",
126
+ ... workflow_type="python_script",
127
+ ... description="Process sample data"
128
+ ... )
129
+ """
130
+
131
+ # Check to see if execution file info is being passed in by calling program.
132
+ if "DERIVA_ML_WORKFLOW_URL" in os.environ:
133
+ github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
134
+ checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
135
+ is_notebook = True
136
+ else:
137
+ path, is_notebook = Workflow._get_python_script()
138
+ github_url, checksum = Workflow.get_url_and_checksum(path)
139
+
140
+ return Workflow(
141
+ name=name,
142
+ url=github_url,
143
+ checksum=checksum,
144
+ description=description,
145
+ workflow_type=workflow_type,
146
+ is_notebook=is_notebook,
147
+ )
148
+
149
+ @staticmethod
150
+ def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
151
+ """Determines the Git URL and checksum for a file.
152
+
153
+ Computes the Git repository URL and file checksum for the specified path.
154
+ For notebooks, strips cell outputs before computing the checksum.
155
+
156
+ Args:
157
+ executable_path: Path to the workflow file.
158
+
159
+ Returns:
160
+ tuple[str, str]: (GitHub URL, Git object hash)
161
+
162
+ Raises:
163
+ DerivaMLException: If not in a Git repository.
164
+
165
+ Example:
166
+ >>> url, checksum = Workflow.get_url_and_checksum(Path("analysis.ipynb"))
167
+ >>> print(f"URL: {url}")
168
+ >>> print(f"Checksum: {checksum}")
169
+ """
170
+ try:
171
+ subprocess.run(
172
+ "git rev-parse --is-inside-work-tree",
173
+ capture_output=True,
174
+ text=True,
175
+ shell=True,
176
+ check=True,
177
+ )
178
+ except subprocess.CalledProcessError:
179
+ raise DerivaMLException("Not executing in a Git repository.")
180
+
181
+ github_url, is_dirty = Workflow._github_url(executable_path)
182
+
183
+ if is_dirty:
184
+ logging.getLogger("deriva_ml").warning(
185
+ f"File {executable_path} has been modified since last commit. Consider commiting before executing"
186
+ )
187
+
188
+ # If you are in a notebook, strip out the outputs before computing the checksum.
189
+ cmd = (
190
+ f"nbstripout -t {executable_path} | git hash-object --stdin"
191
+ if "ipynb" == executable_path.suffix
192
+ else f"git hash-object {executable_path}"
193
+ )
194
+ checksum = (
195
+ subprocess.run(
196
+ cmd,
197
+ capture_output=True,
198
+ text=True,
199
+ check=False,
200
+ shell=True,
201
+ ).stdout.strip()
202
+ if executable_path != "REPL"
203
+ else "1"
204
+ )
205
+ return github_url, checksum
206
+
207
+ @staticmethod
208
+ def _get_git_root(executable_path: Path) -> str | None:
209
+ """Gets the root directory of the Git repository.
210
+
211
+ Args:
212
+ executable_path: Path to check for Git repository.
213
+
214
+ Returns:
215
+ str | None: Absolute path to repository root, or None if not in repository.
216
+ """
217
+ try:
218
+ result = subprocess.run(
219
+ ["git", "rev-parse", "--show-toplevel"],
220
+ cwd=executable_path.parent,
221
+ stdout=subprocess.PIPE,
222
+ stderr=subprocess.DEVNULL,
223
+ text=True,
224
+ check=True,
225
+ )
226
+ return result.stdout.strip()
227
+ except subprocess.CalledProcessError:
228
+ return None # Not in a git repository
229
+
86
230
  @staticmethod
87
231
  def _check_nbstrip_status() -> None:
88
- """Check to see if nbstrip is installed"""
232
+ """Checks if nbstripout is installed and configured.
233
+
234
+ Verifies that the nbstripout tool is available and properly installed in the
235
+ Git repository. Issues warnings if setup is incomplete.
236
+ """
89
237
  logger = logging.getLogger("deriva_ml")
90
238
  try:
91
239
  if subprocess.run(
@@ -93,15 +241,17 @@ class Workflow(BaseModel):
93
241
  check=False,
94
242
  capture_output=True,
95
243
  ).returncode:
96
- logger.warning(
97
- "nbstripout is not installed in repository. Please run nbstripout --install"
98
- )
244
+ logger.warning("nbstripout is not installed in repository. Please run nbstripout --install")
99
245
  except subprocess.CalledProcessError:
100
246
  logger.error("nbstripout is not found.")
101
247
 
102
248
  @staticmethod
103
249
  def _get_notebook_path() -> Path | None:
104
- """Return the absolute path of the current notebook."""
250
+ """Gets the path of the currently executing notebook.
251
+
252
+ Returns:
253
+ Path | None: Absolute path to current notebook, or None if not in notebook.
254
+ """
105
255
 
106
256
  server, session = Workflow._get_notebook_session()
107
257
  if server and session:
@@ -116,7 +266,7 @@ class Workflow(BaseModel):
116
266
  """Return the absolute path of the current notebook."""
117
267
  # Get the kernel's connection file and extract the kernel ID
118
268
  try:
119
- if not (connection_file := Path(get_connection_file()).name):
269
+ if not (connection_file := Path(get_kernel_connection()).name):
120
270
  return None, None
121
271
  except RuntimeError:
122
272
  return None, None
@@ -124,7 +274,7 @@ class Workflow(BaseModel):
124
274
  kernel_id = connection_file.split("-", 1)[1].split(".")[0]
125
275
 
126
276
  # Look through the running server sessions to find the matching kernel ID
127
- for server in list_running_servers():
277
+ for server in get_servers():
128
278
  try:
129
279
  # If a token is required for authentication, include it in headers
130
280
  token = server.get("token", "")
@@ -158,29 +308,27 @@ class Workflow(BaseModel):
158
308
  if len(stack) > 1:
159
309
  filename = Path(stack[2].filename)
160
310
  if not filename.exists():
161
- # Begin called from command line interpreter.
311
+ # Being called from the command line interpreter.
162
312
  filename = Path("REPL")
163
313
  # Get the caller's filename, which is two up the stack from here.
164
314
  else:
165
- raise DerivaMLException(
166
- "Looking for caller failed"
167
- ) # Stack is too shallow
315
+ raise DerivaMLException("Looking for caller failed") # Stack is too shallow
168
316
  return filename, is_notebook
169
317
 
170
318
  @staticmethod
171
319
  def _github_url(executable_path: Path) -> tuple[str, bool]:
172
- """Return a GitHUB URL for the latest commit of the script from which this routine is called.
320
+ """Return a GitHub URL for the latest commit of the script from which this routine is called.
173
321
 
174
- This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
175
- the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
176
- file in GitHUB.
322
+ This routine is meant to be called from a script or notebook (e.g., python -m file). It assumes that
323
+ the file is in a GitHub repository and committed. It returns a URL to the last committed version of this
324
+ file in GitHub.
177
325
 
178
- Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
326
+ Returns: A tuple with the github_url and a boolean to indicate if uncommitted changes
179
327
  have been made to the file.
180
328
 
181
329
  """
182
330
 
183
- # Get repo URL from local gitHub repo.
331
+ # Get repo URL from local GitHub repo.
184
332
  if executable_path == "REPL":
185
333
  return "REPL", True
186
334
  try:
@@ -197,7 +345,7 @@ class Workflow(BaseModel):
197
345
  # Find the root directory for the repository
198
346
  repo_root = Workflow._get_git_root(executable_path)
199
347
 
200
- # Now check to see if file has been modified since the last commit.
348
+ # Now check to see if a file has been modified since the last commit.
201
349
  try:
202
350
  result = subprocess.run(
203
351
  ["git", "status", "--porcelain"],
@@ -206,11 +354,9 @@ class Workflow(BaseModel):
206
354
  text=True,
207
355
  check=True,
208
356
  )
209
- is_dirty = bool(
210
- "M " in result.stdout.strip()
211
- ) # Returns True if output indicates a modified file
357
+ is_dirty = bool("M " in result.stdout.strip()) # Returns True if the output indicates a modified file
212
358
  except subprocess.CalledProcessError:
213
- is_dirty = False # If Git command fails, assume no changes
359
+ is_dirty = False # If the Git command fails, assume no changes
214
360
 
215
361
  """Get SHA-1 hash of latest commit of the file in the repository"""
216
362
  result = subprocess.run(
@@ -223,161 +369,3 @@ class Workflow(BaseModel):
223
369
  sha = result.stdout.strip()
224
370
  url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
225
371
  return url, is_dirty
226
-
227
- @staticmethod
228
- def _get_git_root(executable_path: Path):
229
- try:
230
- result = subprocess.run(
231
- ["git", "rev-parse", "--show-toplevel"],
232
- cwd=executable_path.parent,
233
- stdout=subprocess.PIPE,
234
- stderr=subprocess.DEVNULL,
235
- text=True,
236
- check=True,
237
- )
238
- return result.stdout.strip()
239
- except subprocess.CalledProcessError:
240
- return None # Not in a git repository
241
-
242
- @staticmethod
243
- def create_workflow(
244
- name: str,
245
- workflow_type: str,
246
- description: str = "",
247
- ) -> Workflow:
248
- """Identify current executing program and return a workflow RID for it
249
-
250
- Determine the notebook or script that is currently being executed. Assume that this is
251
- being executed from a cloned GitHub repository. Determine the remote repository name for
252
- this object. Then either retrieve an existing workflow for this executable or create
253
- a new one.
254
-
255
- Args:
256
- name: The name of the workflow.
257
- workflow_type: The type of the workflow.
258
- description: The description of the workflow.
259
- """
260
-
261
- # Check to see if execution file info is being passed in by calling program.
262
- if "DERIVA_ML_WORKFLOW_URL" in os.environ:
263
- github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
264
- checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
265
- is_notebook = True
266
- else:
267
- path, is_notebook = Workflow._get_python_script()
268
- github_url, checksum = Workflow.get_url_and_checksum(path)
269
-
270
- return Workflow(
271
- name=name,
272
- url=github_url,
273
- checksum=checksum,
274
- description=description,
275
- workflow_type=workflow_type,
276
- is_notebook=is_notebook,
277
- )
278
-
279
- @staticmethod
280
- def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
281
- """Determine the checksum for a specified executable"""
282
- try:
283
- subprocess.run(
284
- "git rev-parse --is-inside-work-tree",
285
- capture_output=True,
286
- text=True,
287
- shell=True,
288
- check=True,
289
- )
290
- except subprocess.CalledProcessError:
291
- raise DerivaMLException("Not executing in a Git repository.")
292
-
293
- github_url, is_dirty = Workflow._github_url(executable_path)
294
-
295
- if is_dirty:
296
- logging.getLogger("deriva_ml").warning(
297
- f"File {executable_path} has been modified since last commit. Consider commiting before executing"
298
- )
299
-
300
- # If you are in a notebook, strip out the outputs before computing the checksum.
301
- cmd = (
302
- f"nbstripout -t {executable_path} | git hash-object --stdin"
303
- if "ipynb" == executable_path.suffix
304
- else f"git hash-object {executable_path}"
305
- )
306
- checksum = (
307
- subprocess.run(
308
- cmd,
309
- capture_output=True,
310
- text=True,
311
- check=False,
312
- shell=True,
313
- ).stdout.strip()
314
- if executable_path != "REPL"
315
- else "1"
316
- )
317
- return github_url, checksum
318
-
319
-
320
- class ExecutionConfiguration(BaseModel):
321
- """Define the parameters that are used to configure a specific execution.
322
-
323
- Attributes:
324
- datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
325
- should be materialized.
326
- assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
327
- parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
328
- workflow: Either a Workflow object, or a RID for a workflow instance.
329
- parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
330
- description: A description of the execution. Can use Markdown format.
331
- """
332
-
333
- datasets: conlist(DatasetSpec) = []
334
- assets: list[RID | str] = [] # List of RIDs to model files.
335
- workflow: RID | Workflow
336
- parameters: dict[str, Any] | Path = {}
337
- description: str = ""
338
- argv: conlist(str) = Field(default_factory=lambda: sys.argv)
339
-
340
- model_config = ConfigDict(arbitrary_types_allowed=True)
341
-
342
- @field_validator("parameters", mode="before")
343
- @classmethod
344
- def validate_parameters(cls, value: Any) -> Any:
345
- """If a parameter is a file, assume that it has JSON contents for configuration parameters"""
346
- if isinstance(value, str) or isinstance(value, Path):
347
- with open(value, "r") as f:
348
- return json.load(f)
349
- else:
350
- return value
351
-
352
- @staticmethod
353
- def load_configuration(path: Path) -> ExecutionConfiguration:
354
- """Create a ExecutionConfiguration from a JSON configuration file.
355
-
356
- Args:
357
- path: File containing JSON version of execution configuration.
358
-
359
- Returns:
360
- An execution configuration whose values are loaded from the given file.
361
- """
362
- with open(path) as fd:
363
- config = json.load(fd)
364
- return ExecutionConfiguration.model_validate(config)
365
-
366
- # def download_execution_configuration(
367
- # self, configuration_rid: RID
368
- # ) -> ExecutionConfiguration:
369
- # """Create an ExecutionConfiguration object from a catalog RID that points to a JSON representation of that
370
- # configuration in hatrac
371
- #
372
- # Args:
373
- # configuration_rid: RID that should be to an asset table that refers to an execution configuration
374
- #
375
- # Returns:
376
- # A ExecutionConfiguration object for configured by the parameters in the configuration file.
377
- # """
378
- # AssertionError("Not Implemented")
379
- # configuration = self.retrieve_rid(configuration_rid)
380
- # with NamedTemporaryFile("w+", delete=False, suffix=".json") as dest_file:
381
- # hs = HatracStore("https", self.host_name, self.credential)
382
- # hs.get_obj(path=configuration["URL"], destfilename=dest_file.name)
383
- # return ExecutionConfiguration.load_configuration(Path(dest_file.name))