deriva-ml 1.12.2__py3-none-any.whl → 1.13.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
deriva_ml/__init__.py CHANGED
@@ -14,6 +14,7 @@ __all__ = [
     "BuiltinTypes",
     "UploadState",
     "MLVocab",
+    "MLAsset",
     "ExecMetadataVocab",
     "RID",
     "DerivaSystemColumns",
@@ -32,6 +33,7 @@ from .deriva_definitions import (
     RID,
     DerivaMLException,
     MLVocab,
+    MLAsset,
     ExecMetadataVocab,
     DerivaSystemColumns,
 )
@@ -49,4 +51,3 @@ try:
 except PackageNotFoundError:
     # package is not installed
     pass
-
deriva_ml/dataset.py CHANGED
@@ -41,7 +41,14 @@ from tempfile import TemporaryDirectory, NamedTemporaryFile
 from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING

 from deriva_ml import DatasetBag
-from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
+from .deriva_definitions import (
+    ML_SCHEMA,
+    DerivaMLException,
+    MLVocab,
+    Status,
+    RID,
+    DRY_RUN_RID,
+)
 from .history import iso_to_snap
 from .deriva_model import DerivaModel
 from .database_model import DatabaseModel
@@ -957,7 +964,7 @@ class Dataset:
         for the dataset.
         """
         if (
-            execution_rid
+            execution_rid != DRY_RUN_RID
            and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
         ):
            raise DerivaMLException(f"RID {execution_rid} is not an execution")
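
The dataset.py hunk above replaces a simple truthiness test with an explicit comparison against the new DRY_RUN_RID sentinel, so a dry-run execution (RID "0000") bypasses catalog validation instead of being rejected. A minimal sketch of the resulting check, with the catalog lookup replaced by a hypothetical resolve_table_name callable since the real resolve_rid needs a live catalog:

DRY_RUN_RID = "0000"  # sentinel RID assigned to dry-run executions

def validate_execution_rid(execution_rid: str, resolve_table_name) -> None:
    """Reject any RID that is neither the dry-run sentinel nor an Execution row."""
    # resolve_table_name stands in for self._model.catalog.resolve_rid(rid).table.name
    if (
        execution_rid != DRY_RUN_RID
        and resolve_table_name(execution_rid) != "Execution"
    ):
        raise ValueError(f"RID {execution_rid} is not an execution")

# The sentinel passes without ever touching the catalog:
validate_execution_rid(DRY_RUN_RID, lambda rid: "Dataset")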
deriva_ml/deriva_definitions.py CHANGED
@@ -21,6 +21,7 @@ from pydantic import (
 from socket import gethostname

 ML_SCHEMA = "deriva-ml"
+DRY_RUN_RID = "0000"

 # We are going to use schema as a field name and this collides with method in pydantic base class
 warnings.filterwarnings(
@@ -191,6 +192,11 @@ class MLVocab(StrEnum):
     asset_role = "Asset_Role"


+class MLAsset(StrEnum):
+    execution_metadata = "Execution_Metadata"
+    execution_asset = "Execution_Asset"
+
+
 class ExecMetadataVocab(StrEnum):
     """
     Predefined execution metadata types.
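
MLAsset follows the same StrEnum pattern as the existing MLVocab: members name the built-in asset tables, so call sites can write MLAsset.execution_metadata rather than the raw string "Execution_Metadata". A self-contained sketch of the idiom (the StrEnum base shown here is a stand-in for whatever deriva_definitions actually uses):

from enum import Enum

class StrEnum(str, Enum):
    """String-valued enum; members compare equal to their values."""

class MLAsset(StrEnum):
    execution_metadata = "Execution_Metadata"
    execution_asset = "Execution_Asset"

# Usable anywhere a table-name string is expected:
assert MLAsset.execution_metadata == "Execution_Metadata"
print(f"asset table: {MLAsset.execution_asset.value}")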
deriva_ml/deriva_ml_base.py CHANGED
@@ -14,17 +14,16 @@ import getpass
 import logging
 from datetime import datetime
 from itertools import chain
-import inspect
-import setuptools_scm
 from pathlib import Path
 import requests
-import subprocess
+
 from typing import Optional, Any, Iterable, TYPE_CHECKING
+
 from deriva.core import (
     get_credential,
     urlquote,
-    DEFAULT_SESSION_CONFIG,
     format_exception,
+    DEFAULT_SESSION_CONFIG,
 )
 import deriva.core.datapath as datapath
 from deriva.core.datapath import DataPathException
@@ -33,7 +32,6 @@ from deriva.core.ermrest_catalog import ResolveRidResult
 from deriva.core.ermrest_model import Key, Table
 from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
 from pydantic import validate_call, ConfigDict
-from requests import RequestException

 from .execution_configuration import ExecutionConfiguration, Workflow
 from .feature import Feature, FeatureRecord
@@ -60,33 +58,6 @@ except ImportError:  # Graceful fallback if IceCream isn't installed.
     ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa


-try:
-    from IPython import get_ipython
-except ImportError:  # Graceful fallback if IPython isn't installed.
-
-    def get_ipython():
-        """Dummy routine in case you are not running in IPython."""
-        return None
-
-
-try:
-    from jupyter_server.serverapp import list_running_servers
-except ImportError:
-
-    def list_running_servers():
-        """Dummy routine in case you are not running in Jupyter."""
-        return []
-
-
-try:
-    from ipykernel import get_connection_file
-except ImportError:
-
-    def get_connection_file():
-        """Dummy routine in case you are not running in Jupyter."""
-        return ""
-
-
 if TYPE_CHECKING:
     from .execution import Execution

@@ -165,7 +136,6 @@ class DerivaML(Dataset):
         self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
-        self.executable_path, self._is_notebook = self._get_python_script()
         self.domain_schema = self.model.domain_schema
         self.project_name = project_name or self.domain_schema
         self.start_time = datetime.now()
@@ -192,102 +162,6 @@ class DerivaML(Dataset):
         except (AttributeError, requests.HTTPError):
             pass

-    def _check_nbstrip_status(self) -> None:
-        """Check to see if nbstrip is installed"""
-        try:
-            if subprocess.run(
-                ["nbstripout", "--is-installed"],
-                check=False,
-                capture_output=True,
-            ).returncode:
-                self._logger.warning(
-                    "nbstripout is not installed in repository. Please run nbstripout --install"
-                )
-        except subprocess.CalledProcessError:
-            self._logger.error("nbstripout is not found.")
-
-    @staticmethod
-    def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
-        """Return the absolute path of the current notebook."""
-        # Get the kernel's connection file and extract the kernel ID
-        try:
-            if not (connection_file := Path(get_connection_file()).name):
-                return None, None
-        except RuntimeError:
-            return None, None
-
-        kernel_id = connection_file.split("-", 1)[1].split(".")[0]
-
-        # Look through the running server sessions to find the matching kernel ID
-        for server in list_running_servers():
-            try:
-                # If a token is required for authentication, include it in headers
-                token = server.get("token", "")
-                headers = {}
-                if token:
-                    headers["Authorization"] = f"token {token}"
-
-                try:
-                    sessions_url = server["url"] + "api/sessions"
-                    response = requests.get(sessions_url, headers=headers)
-                    response.raise_for_status()
-                    sessions = response.json()
-                except RequestException as e:
-                    raise e
-                for sess in sessions:
-                    if sess["kernel"]["id"] == kernel_id:
-                        return server, sess
-            except Exception as _e:
-                # Ignore servers we can't connect to.
-                pass
-        return None, None
-
-    def _get_notebook_path(self) -> Path | None:
-        """Return the absolute path of the current notebook."""
-
-        server, session = DerivaML._get_notebook_session()
-        if server and session:
-            self._check_nbstrip_status()
-            relative_path = session["notebook"]["path"]
-            # Join the notebook directory with the relative path
-            return Path(server["root_dir"]) / relative_path
-        else:
-            return None
-
-    def _get_python_script(self) -> tuple[Path, bool]:
-        """Return the path to the currently executing script"""
-        is_notebook = False
-        if filename := self._get_notebook_path():
-            is_notebook = True
-        else:
-            stack = inspect.stack()
-            # Get the caller's filename, which is two up the stack from here.
-            if len(stack) > 1:
-                filename = Path(stack[2].filename)
-                if not filename.exists():
-                    # Begin called from command line interpreter.
-                    filename = "REPL"
-                # Get the caller's filename, which is two up the stack from here.
-            else:
-                raise DerivaMLException(
-                    "Looking for caller failed"
-                )  # Stack is too shallow
-        return filename, is_notebook
-
-    def _get_git_root(self):
-        try:
-            result = subprocess.run(
-                ["git", "rev-parse", "--show-toplevel"],
-                cwd=self.executable_path.parent,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.DEVNULL,
-                text=True,
-                check=True,
-            )
-            return result.stdout.strip()
-        except subprocess.CalledProcessError:
-            return None  # Not in a git repository
-
     @staticmethod
     def _get_session_config():
         """ """
@@ -311,10 +185,6 @@ class DerivaML(Dataset):
         """Get a new instance of a pathBuilder object."""
         return self.catalog.getPathBuilder()

-    def get_version(self) -> str:
-        """Return the version number of the executable"""
-        return setuptools_scm.get_version(root=self._get_git_root())
-
     @property
     def domain_path(self):
         """Get a new instance of a pathBuilder object to the domain schema"""
@@ -1117,105 +987,7 @@ class DerivaML(Dataset):
         # Make sure type is correct.
         self.lookup_term(MLVocab.workflow_type, workflow_type)

-        try:
-            subprocess.run(
-                "git rev-parse --is-inside-work-tree",
-                capture_output=True,
-                text=True,
-                shell=True,
-                check=True,
-            )
-        except subprocess.CalledProcessError:
-            raise DerivaMLException("Not executing in a Git repository.")
-
-        github_url, is_dirty = self._github_url()
-
-        if is_dirty:
-            self._logger.warning(
-                f"File {self.executable_path} has been modified since last commit. Consider commiting before executing"
-            )
-
-        # If you are in a notebook, strip out the outputs before computing the checksum.
-        cmd = (
-            f"nbstripout {self.executable_path} | git hash-object --stdin"
-            if self._is_notebook
-            else f"git hash-object {self.executable_path}"
-        )
-        checksum = (
-            subprocess.run(
-                cmd,
-                capture_output=True,
-                text=True,
-                check=False,
-                shell=True,
-            ).stdout.strip()
-            if self.executable_path != "REPL"
-            else "1"
-        )
-
-        return Workflow(
-            name=name,
-            url=github_url,
-            checksum=checksum,
-            description=description,
-            workflow_type=workflow_type,
-        )
-
-    def _github_url(self) -> tuple[str, bool]:
-        """Return a GitHUB URL for the latest commit of the script from which this routine is called.
-
-        This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
-        the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
-        file in GitHUB.
-
-        Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
-        have been made to the file.
-
-        """
-
-        # Get repo URL from local gitHub repo.
-        if self.executable_path == "REPL":
-            return "REPL", True
-        try:
-            result = subprocess.run(
-                ["git", "remote", "get-url", "origin"],
-                capture_output=True,
-                text=True,
-                cwd=self.executable_path.parent,
-            )
-            github_url = result.stdout.strip().removesuffix(".git")
-        except subprocess.CalledProcessError:
-            raise DerivaMLException("No GIT remote found")
-
-        # Find the root directory for the repository
-        repo_root = self._get_git_root()
-
-        # Now check to see if file has been modified since the last commit.
-        try:
-            result = subprocess.run(
-                ["git", "status", "--porcelain"],
-                cwd=self.executable_path.parent,
-                capture_output=True,
-                text=True,
-                check=True,
-            )
-            is_dirty = bool(
-                "M " in result.stdout.strip()
-            )  # Returns True if output indicates a modified file
-        except subprocess.CalledProcessError:
-            is_dirty = False  # If Git command fails, assume no changes
-
-        """Get SHA-1 hash of latest commit of the file in the repository"""
-        result = subprocess.run(
-            ["git", "log", "-n", "1", "--pretty=format:%H--", self.executable_path],
-            cwd=self.executable_path.parent,
-            capture_output=True,
-            text=True,
-            check=True,
-        )
-        sha = result.stdout.strip()
-        url = f"{github_url}/blob/{sha}/{self.executable_path.relative_to(repo_root)}"
-        return url, is_dirty
+        return Workflow.create_workflow(name, workflow_type, description)

     # @validate_call
     def create_execution(
@@ -1259,6 +1031,15 @@ class DerivaML(Dataset):
             exec_rid=execution_rid,
             file_name="configuration.json",
             asset_table=self.model.name_to_table("Execution_Metadata"),
+            metadata={},
         )
-        configuration = ExecutionConfiguration.load_configuration(cfile)
+
+        if cfile.exists():
+            configuration = ExecutionConfiguration.load_configuration(cfile)
+        else:
+            execution = self.retrieve_rid(execution_rid)
+            configuration = ExecutionConfiguration(
+                workflow=execution["Workflow"],
+                description=execution["Description"],
+            )
         return Execution(configuration, self, reload=execution_rid)
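
Two behavioral changes are bundled here: workflow introspection moves onto Workflow.create_workflow (deriva_ml_base keeps only a one-line delegation), and restoring an execution no longer assumes a configuration.json asset was uploaded; when the file is absent, the configuration is rebuilt from the Execution row itself. A sketch of that fallback in isolation, with retrieve_rid reduced to a hypothetical fetch_row callable and the configuration to a plain dict:

import json
from pathlib import Path

def load_or_rebuild_configuration(cfile: Path, fetch_row) -> dict:
    """Prefer the saved configuration.json; otherwise rebuild from catalog columns."""
    if cfile.exists():
        return json.loads(cfile.read_text())
    row = fetch_row()  # stand-in for self.retrieve_rid(execution_rid)
    return {"workflow": row["Workflow"], "description": row["Description"]}

# With no saved file, the Execution row supplies the configuration:
cfg = load_or_rebuild_configuration(
    Path("/nonexistent/configuration.json"),
    lambda: {"Workflow": "2-1234", "Description": "restored run"},
)
print(cfg)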
deriva_ml/execution.py CHANGED
@@ -5,21 +5,31 @@ This module defined the Execution class which is used to interact with the state
 from __future__ import annotations

 from collections import defaultdict
+from datetime import datetime
 import json
 import logging
 import os
-import shutil
-from datetime import datetime
 from pathlib import Path
-from typing import Iterable, Any, Optional

-from deriva.core import format_exception
 from pydantic import validate_call, ConfigDict
+import regex as re
 import sys
-from deriva.core.hatrac_store import HatracStore
+import shutil
+from typing import Iterable, Any, Optional

+from deriva.core import format_exception
+from deriva.core.datapath import DataPathException
+from deriva.core.hatrac_store import HatracStore
 from .deriva_definitions import ExecMetadataVocab
-from .deriva_definitions import RID, Status, FileUploadState, DerivaMLException, MLVocab
+from .deriva_definitions import (
+    RID,
+    Status,
+    FileUploadState,
+    DerivaMLException,
+    MLVocab,
+    MLAsset,
+    DRY_RUN_RID,
+)
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
@@ -45,11 +55,14 @@ except ImportError:  # Graceful fallback if IceCream isn't installed.


 try:
-    from jupyter_server.serverapp import list_running_servers
+    from IPython.display import display, Markdown
 except ImportError:

-    def list_running_servers():
-        return []
+    def display(s):
+        print(s)
+
+    def Markdown(s):
+        return s


 class AssetFilePath(type(Path())):
@@ -167,7 +180,7 @@ class Execution:
             self.workflow_rid = (
                 self._ml_object.add_workflow(self.configuration.workflow)
                 if not self._dry_run
-                else "0000"
+                else DRY_RUN_RID
             )
         else:
             self.workflow_rid = self.configuration.workflow
@@ -195,10 +208,10 @@
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
-            if self.execution_rid == "0000":
+            if self.execution_rid == DRY_RUN_RID:
                 self._dry_run = True
         elif self._dry_run:
-            self.execution_rid = "0000"
+            self.execution_rid = DRY_RUN_RID
         else:
             self.execution_rid = schema_path.Execution.insert(
                 [
@@ -209,6 +222,15 @@
                 ]
             )[0]["RID"]

+        if (
+            isinstance(self.configuration.workflow, Workflow)
+            and self.configuration.workflow.is_notebook
+        ):
+            # Put execution_rid into cell output so we can find it later.
+            display(
+                Markdown(f"Execution RID: {self._ml_object.cite(self.execution_rid)}")
+            )
+
         # Create a directory for execution rid so we can recover state in case of a crash.
         execution_root(prefix=self._ml_object.working_dir, exec_rid=self.execution_rid)
         self._initialize_execution(reload)
@@ -272,16 +294,20 @@
         )

         # Save configuration details for later upload
-        cfile = self.asset_file_path(
-            asset_name="Execution_Metadata",
-            file_name="configuration.json",
-            asset_types=ExecMetadataVocab.execution_config.value,
-        )
-        with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
-            json.dump(self.configuration.model_dump(), config_file)
+        if not reload:
+            cfile = self.asset_file_path(
+                asset_name=MLAsset.execution_metadata,
+                file_name="configuration.json",
+                asset_types=ExecMetadataVocab.execution_config.value,
+            )
+            with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
+                json.dump(self.configuration.model_dump(), config_file)
+
+            for parameter_file in self.configuration.parameters:
+                self.asset_file_path(MLAsset.execution_assets, parameter_file)

-        # save runtime env
-        self._save_runtime_environment()
+            # save runtime env
+            self._save_runtime_environment()

         self.start_time = datetime.now()
         self.update_status(Status.pending, "Initialize status finished.")
@@ -625,9 +651,20 @@
            with open(feature_file, "r") as feature_values:
                entities = [json.loads(line.strip()) for line in feature_values]
            # Update the asset columns in the feature and add to the catalog.
-            self._ml_object.domain_path.tables[feature_table].insert(
-                [map_path(e) for e in entities]
-            )
+            try:
+                self._ml_object.domain_path.tables[feature_table].insert(
+                    [map_path(e) for e in entities]
+                )
+            except DataPathException as e:
+                if re.match(
+                    rf'DETAIL: +Key +\("Execution", +"{target_table}", +"Feature_Name"\)=\(.*\) already exists',
+                    e.message,
+                ):
+                    self._logger.info(
+                        f"Skipping reload of feature values for {feature_table}"
+                    )
+                else:
+                    raise e

     def _update_asset_execution_table(
         self,
@@ -652,16 +689,27 @@
         asset_exe = self._model.find_association(asset_table_name, "Execution")
         asset_exe_path = pb.schemas[asset_exe.schema.name].tables[asset_exe.name]

-        asset_exe_path.insert(
-            [
-                {
-                    asset_table_name: asset_path.asset_rid,
-                    "Execution": self.execution_rid,
-                    "Asset_Role": asset_role,
-                }
-                for asset_path in asset_list
-            ]
-        )
+        try:
+            asset_exe_path.insert(
+                [
+                    {
+                        asset_table_name: asset_path.asset_rid,
+                        "Execution": self.execution_rid,
+                        "Asset_Role": asset_role,
+                    }
+                    for asset_path in asset_list
+                ]
+            )
+        except DataPathException as e:
+            if re.match(
+                rf'DETAIL: +Key +\("{asset_table_name}", +"Execution"\)=\(.*\) already exists',
+                e.message,
+            ):
+                self._logger.info(
+                    f"Skipping reload of execution assocations for {asset_table_name}"
+                )
+            else:
+                raise e

         # Now add in the type names via the asset_asset_type association table.
         # Get the list of types for each file in the asset.
@@ -687,19 +735,30 @@
         type_path = pb.schemas[asset_asset_type.schema.name].tables[
             asset_asset_type.name
         ]
-        type_path.insert(
-            [
-                {asset_table_name: asset.asset_rid, "Asset_Type": t}
-                for asset in asset_list
-                for t in asset_type_map[asset.file_name]
-            ]
-        )
+        try:
+            type_path.insert(
+                [
+                    {asset_table_name: asset.asset_rid, "Asset_Type": t}
+                    for asset in asset_list
+                    for t in asset_type_map[asset.file_name]
+                ]
+            )
+        except DataPathException as e:
+            if re.match(
+                rf'DETAIL: +Key +\("{asset_table_name}", +"Asset_Type"\)=\(.*\) already exists',
+                e.message,
+            ):
+                self._logger.info(
+                    f"Skipping reload of execution asset types for {asset_table_name}"
+                )
+            else:
+                raise e

     @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
     def asset_file_path(
         self,
         asset_name: str,
-        file_name: str,
+        file_name: str | Path,
         asset_types: Optional[list[str] | str] = None,
         copy_file=False,
         **kwargs,
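
All three insert sites in execution.py now share one idempotent-reload idiom: attempt the insert and, if the server reports a duplicate-key violation whose DETAIL text matches the association's key columns, log and continue instead of failing the reload. A generic sketch of the idiom; plain Exception and RuntimeError stand in for the DataPathException the diff actually catches:

import logging
import re

logger = logging.getLogger("deriva_ml")

def insert_idempotent(insert_fn, rows, key_pattern: str, what: str) -> None:
    """Insert rows, treating a matching duplicate-key error as an already-loaded no-op."""
    try:
        insert_fn(rows)
    except Exception as e:
        detail = getattr(e, "message", str(e))
        if re.search(key_pattern, detail):
            logger.info(f"Skipping reload of {what}")
        else:
            raise

# An insert that fails with a matching key violation is silently skipped:
def fake_insert(_rows):
    raise RuntimeError('DETAIL:  Key ("Model", "Execution")=(1-x, 2-y) already exists')

insert_idempotent(
    fake_insert, [], r'Key \("Model", "Execution"\)=\(.*\) already exists',
    "execution associations",
)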
deriva_ml/execution_configuration.py CHANGED
@@ -4,16 +4,56 @@ Classes that are used to define an execution configuration.

 from __future__ import annotations

+import inspect
 import json
+import logging
+import os
+
+from requests import RequestException
+import requests
+import subprocess
 from typing import Optional, Any

-from pydantic import BaseModel, conlist, ConfigDict, field_validator, Field
+from pydantic import (
+    BaseModel,
+    conlist,
+    ConfigDict,
+    field_validator,
+    Field,
+    PrivateAttr,
+)
 from pathlib import Path
 import sys


 from .dataset_aux_classes import DatasetSpec
-from .deriva_definitions import RID
+from .deriva_definitions import RID, DerivaMLException
+
+try:
+    from IPython import get_ipython
+except ImportError:  # Graceful fallback if IPython isn't installed.
+
+    def get_ipython():
+        """Dummy routine in case you are not running in IPython."""
+        return None
+
+
+try:
+    from jupyter_server.serverapp import list_running_servers
+except ImportError:
+
+    def list_running_servers():
+        """Dummy routine in case you are not running in Jupyter."""
+        return []
+
+
+try:
+    from ipykernel import get_connection_file
+except ImportError:
+
+    def get_connection_file():
+        """Dummy routine in case you are not running in Jupyter."""
+        return ""


 class Workflow(BaseModel):
@@ -26,15 +66,255 @@ class Workflow(BaseModel):
     workflow_type: The type of the workflow. Must be an existing controlled vocabulary term.
     version: The version of the workflow instance. Should follow semantic versioning.
     description: A description of the workflow instance. Can be in Markdown format.
+    is_notebook: A boolean indicating whether this workflow instance is a notebook or not.
     """

     name: str
     url: str
     workflow_type: str
     version: Optional[str] = None
-    description: Optional[str] = ""
+    description: str = None
     rid: Optional[RID] = None
-    checksum: Optional[str]
+    checksum: Optional[str] = None
+    is_notebook: bool = False
+
+    _logger: Any = PrivateAttr()
+
+    def __post_init__(self):
+        self._logger = logging.getLogger("deriva_ml")
+
+    @staticmethod
+    def _check_nbstrip_status() -> None:
+        """Check to see if nbstrip is installed"""
+        logger = logging.getLogger("deriva_ml")
+        try:
+            if subprocess.run(
+                ["nbstripout", "--is-installed"],
+                check=False,
+                capture_output=True,
+            ).returncode:
+                logger.warning(
+                    "nbstripout is not installed in repository. Please run nbstripout --install"
+                )
+        except subprocess.CalledProcessError:
+            logger.error("nbstripout is not found.")
+
+    @staticmethod
+    def _get_notebook_path() -> Path | None:
+        """Return the absolute path of the current notebook."""
+
+        server, session = Workflow._get_notebook_session()
+        if server and session:
+            relative_path = session["notebook"]["path"]
+            # Join the notebook directory with the relative path
+            return Path(server["root_dir"]) / relative_path
+        else:
+            return None
+
+    @staticmethod
+    def _get_notebook_session() -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        """Return the absolute path of the current notebook."""
+        # Get the kernel's connection file and extract the kernel ID
+        try:
+            if not (connection_file := Path(get_connection_file()).name):
+                return None, None
+        except RuntimeError:
+            return None, None
+
+        kernel_id = connection_file.split("-", 1)[1].split(".")[0]
+
+        # Look through the running server sessions to find the matching kernel ID
+        for server in list_running_servers():
+            try:
+                # If a token is required for authentication, include it in headers
+                token = server.get("token", "")
+                headers = {}
+                if token:
+                    headers["Authorization"] = f"token {token}"
+
+                try:
+                    sessions_url = server["url"] + "api/sessions"
+                    response = requests.get(sessions_url, headers=headers)
+                    response.raise_for_status()
+                    sessions = response.json()
+                except RequestException as e:
+                    raise e
+                for sess in sessions:
+                    if sess["kernel"]["id"] == kernel_id:
+                        return server, sess
+            except Exception as _e:
+                # Ignore servers we can't connect to.
+                pass
+        return None, None
+
+    @staticmethod
+    def _get_python_script() -> tuple[Path, bool]:
+        """Return the path to the currently executing script"""
+        is_notebook = True
+        if not (filename := Workflow._get_notebook_path()):
+            is_notebook = False
+            stack = inspect.stack()
+            # Get the caller's filename, which is two up the stack from here.
+            if len(stack) > 1:
+                filename = Path(stack[2].filename)
+                if not filename.exists():
+                    # Begin called from command line interpreter.
+                    filename = Path("REPL")
+                # Get the caller's filename, which is two up the stack from here.
+            else:
+                raise DerivaMLException(
+                    "Looking for caller failed"
+                )  # Stack is too shallow
+        return filename, is_notebook
+
+    @staticmethod
+    def _github_url(executable_path: Path) -> tuple[str, bool]:
+        """Return a GitHUB URL for the latest commit of the script from which this routine is called.
+
+        This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
+        the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
+        file in GitHUB.
+
+        Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
+        have been made to the file.
+
+        """
+
+        # Get repo URL from local gitHub repo.
+        if executable_path == "REPL":
+            return "REPL", True
+        try:
+            result = subprocess.run(
+                ["git", "remote", "get-url", "origin"],
+                capture_output=True,
+                text=True,
+                cwd=executable_path.parent,
+            )
+            github_url = result.stdout.strip().removesuffix(".git")
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("No GIT remote found")
+
+        # Find the root directory for the repository
+        repo_root = Workflow._get_git_root(executable_path)
+
+        # Now check to see if file has been modified since the last commit.
+        try:
+            result = subprocess.run(
+                ["git", "status", "--porcelain"],
+                cwd=executable_path.parent,
+                capture_output=True,
+                text=True,
+                check=True,
+            )
+            is_dirty = bool(
+                "M " in result.stdout.strip()
+            )  # Returns True if output indicates a modified file
+        except subprocess.CalledProcessError:
+            is_dirty = False  # If Git command fails, assume no changes
+
+        """Get SHA-1 hash of latest commit of the file in the repository"""
+        result = subprocess.run(
+            ["git", "log", "-n", "1", "--pretty=format:%H--", executable_path],
+            cwd=executable_path.parent,
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        sha = result.stdout.strip()
+        url = f"{github_url}/blob/{sha}/{executable_path.relative_to(repo_root)}"
+        return url, is_dirty
+
+    @staticmethod
+    def _get_git_root(executable_path: Path):
+        try:
+            result = subprocess.run(
+                ["git", "rev-parse", "--show-toplevel"],
+                cwd=executable_path.parent,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.DEVNULL,
+                text=True,
+                check=True,
+            )
+            return result.stdout.strip()
+        except subprocess.CalledProcessError:
+            return None  # Not in a git repository
+
+    @staticmethod
+    def create_workflow(
+        name: str,
+        workflow_type: str,
+        description: str = "",
+    ) -> Workflow:
+        """Identify current executing program and return a workflow RID for it
+
+        Determine the notebook or script that is currently being executed. Assume that this is
+        being executed from a cloned GitHub repository. Determine the remote repository name for
+        this object. Then either retrieve an existing workflow for this executable or create
+        a new one.
+
+        Args:
+            name: The name of the workflow.
+            workflow_type: The type of the workflow.
+            description: The description of the workflow.
+        """
+
+        # Check to see if execution file info is being passed in by calling program.
+        if "DERIVA_ML_WORKFLOW_URL" in os.environ:
+            github_url = os.environ["DERIVA_ML_WORKFLOW_URL"]
+            checksum = os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"]
+            is_notebook = True
+        else:
+            path, is_notebook = Workflow._get_notebook_path()
+            github_url, checksum = Workflow.get_url_and_checksum(path)
+
+        return Workflow(
+            name=name,
+            url=github_url,
+            checksum=checksum,
+            description=description,
+            workflow_type=workflow_type,
+            is_notebook=is_notebook,
+        )
+
+    @staticmethod
+    def get_url_and_checksum(executable_path: Path) -> tuple[str, str]:
+        """Determine the checksum for a specified executable"""
+        try:
+            subprocess.run(
+                "git rev-parse --is-inside-work-tree",
+                capture_output=True,
+                text=True,
+                shell=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("Not executing in a Git repository.")
+
+        github_url, is_dirty = Workflow._github_url(executable_path)
+
+        if is_dirty:
+            logging.getLogger("deriva_ml").warning(
+                f"File {executable_path} has been modified since last commit. Consider commiting before executing"
+            )
+
+        # If you are in a notebook, strip out the outputs before computing the checksum.
+        cmd = (
+            f"nbstripout -t {executable_path} | git hash-object --stdin"
+            if "ipynb" == executable_path.suffix
+            else f"git hash-object {executable_path}"
+        )
+        checksum = (
+            subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=False,
+                shell=True,
+            ).stdout.strip()
+            if executable_path != "REPL"
+            else "1"
+        )
+        return github_url, checksum


 class ExecutionConfiguration(BaseModel):
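
The environment-variable branch in create_workflow is the hand-off point for the new notebook runner: run_notebook.py computes the notebook's URL and checksum before launching it under papermill, exports them, and the child process picks them up instead of re-introspecting. A minimal sketch of that resolution order, with introspect standing in for the notebook/script discovery path:

import os

def resolve_workflow_source(introspect) -> tuple[str, str, bool]:
    """Environment first (set by the notebook driver), then local introspection."""
    if "DERIVA_ML_WORKFLOW_URL" in os.environ:
        return (
            os.environ["DERIVA_ML_WORKFLOW_URL"],
            os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"],
            True,  # the driver only launches notebooks
        )
    return introspect()

# Simulate the driver's hand-off to a child notebook process:
os.environ["DERIVA_ML_WORKFLOW_URL"] = "https://github.com/org/repo/blob/abc123/nb.ipynb"
os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = "deadbeef"
print(resolve_workflow_source(lambda: ("local-script", "0", False)))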
deriva_ml/run_notebook.py ADDED
@@ -0,0 +1,155 @@
+"""Module to run a notebook using papermill"""
+
+import json
+import os
+import papermill as pm
+from pathlib import Path
+import regex as re
+import tempfile
+
+from deriva_ml import Workflow, DerivaML, MLVocab
+from deriva.core import BaseCLI
+from deriva_ml import MLAsset
+
+
+class DerivaMLRunNotebookCLI(BaseCLI):
+    """Main class to part command line arguments and call model"""
+
+    def __init__(self, description, epilog, **kwargs):
+        BaseCLI.__init__(self, description, epilog, **kwargs)
+        Workflow._check_nbstrip_status()
+        self.parser.add_argument(
+            "notebook_file", type=Path, help="Path to the notebook file"
+        )
+
+        self.parser.add_argument(
+            "--file",
+            "-f",
+            type=Path,
+            default=None,
+            help="JSON file with parameter values to inject into the notebook.",
+        )
+
+        self.parser.add_argument(
+            "--inspect",
+            action="store_true",
+            help="Display parameters information for the given notebook path.",
+        )
+
+        self.parser.add_argument(
+            "--parameter",
+            "-p",
+            nargs=2,
+            action="append",
+            metavar=("KEY", "VALUE"),
+            default=[],
+            help="Provide a parameter name band value to inject into the notebook.",
+        )
+
+        self.parser.add_argument(
+            "--kernel", "-k", nargs=1, help="Name of kernel to run..", default=None
+        )
+
+    @staticmethod
+    def _coerce_number(val: str):
+        """
+        Try to convert a string to int, then float; otherwise return str.
+        """
+        try:
+            return int(val)
+        except ValueError:
+            try:
+                return float(val)
+            except ValueError:
+                return val
+
+    def main(self):
+        """Parse arguments and set up execution environment."""
+        args = self.parse_cli()
+        notebook_file = args.notebook_file
+        parameter_file = args.file
+
+        # args.parameter is now a list of [KEY, VALUE] lists
+        # e.g. [['timeout', '30'], ['name', 'Alice'], ...]
+        parameters = {key: self._coerce_number(val) for key, val in args.parameter}
+
+        if parameter_file:
+            if not (parameter_file.is_file() and parameter_file.suffix == ".json"):
+                print("Parameter file must be an json file.")
+                exit(1)
+            with open(parameter_file, "r") as f:
+                parameters |= json.load(f)
+
+        if not (notebook_file.is_file() and notebook_file.suffix == ".ipynb"):
+            print("Notebook file must be an ipynb file.")
+            exit(1)
+
+        # Create a workflow instance for this specific version of the script. Return an existing workflow if one is found.
+        notebook_parameters = pm.inspect_notebook(notebook_file)
+        if args.inspect:
+            for param, value in notebook_parameters:
+                print(
+                    f"{param}:{value['inferred_type_name']} (default {value['default']})"
+                )
+            return
+        else:
+            notebook_parameters = {
+                k: v["default"] for k, v in notebook_parameters.items()
+            } | parameters
+            print(f"Running notebook {notebook_file.name} with paremeters:")
+            for param, value in notebook_parameters.items():
+                print(f"  {param}:{value}")
+            self.run_notebook(notebook_file.resolve(), parameters, args.kernel)
+
+    def run_notebook(self, notebook_file, parameters, kernel=None):
+        url, checksum = Workflow.get_url_and_checksum(Path(notebook_file))
+        os.environ["DERIVA_ML_WORKFLOW_URL"] = url
+        os.environ["DERIVA_ML_WORKFLOW_CHECKSUM"] = checksum
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            notebook_output = Path(tmpdirname) / Path(notebook_file).name
+            pm.execute_notebook(
+                input_path=notebook_file,
+                output_path=notebook_output,
+                parameters=parameters,
+                kernel_name=kernel,
+            )
+            host = catalog_id = execution_rid = None
+            with open(notebook_output, "r") as f:
+                for line in f:
+                    if m := re.search(
+                        r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)",
+                        line,
+                    ):
+                        host = m["host"]
+                        catalog_id = m["catalog_id"]
+                        execution_rid = m["execution_rid"]
+            if not execution_rid:
+                print("Execution RID not found.")
+                exit(1)
+            print("Uploaded notebook output for Execution RID:", execution_rid)
+
+            ml_instance = DerivaML(hostname=host, catalog_id=catalog_id)
+            ml_instance.add_term(
+                MLVocab.asset_type,
+                "Notebook_Output",
+                description="Jupyter Notebook Output",
+            )
+            execution = ml_instance.restore_execution(execution_rid)
+            execution.asset_file_path(
+                asset_name=MLAsset.execution_asset,
+                file_name=notebook_output,
+                asset_types=["Notebook_Output"],
+            )
+            execution.upload_execution_outputs()
+
+
+def main():
+    cli = DerivaMLRunNotebookCLI(
+        description="Deriva ML Execution Script Demo", epilog=""
+    )
+    cli.main()
+
+
+if __name__ == "__main__":
+    main()
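
The runner closes the loop by scraping the "Execution RID: …" line that Execution now displays into a cell output, then reconnecting to that catalog to attach the executed notebook as an Execution_Asset. A sketch of just the scraping step, using the same regular-expression shape as the diff (the sample line is invented for illustration):

import re

RID_PATTERN = (
    r"Execution RID: https://(?P<host>.*)/id/(?P<catalog_id>.*)/(?P<execution_rid>[\w-]+)"
)

def find_execution_rid(notebook_text: str):
    """Scan executed-notebook output for the displayed citation line."""
    for line in notebook_text.splitlines():
        if m := re.search(RID_PATTERN, line):
            return m["host"], m["catalog_id"], m["execution_rid"]
    return None

# Sample line shaped like the citation Execution displays:
print(find_execution_rid("Execution RID: https://dev.eye-ai.org/id/55/3-ABCD"))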
deriva_ml/test_notebook.ipynb ADDED
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "import builtins\n",
+    "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
+    "from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns, DatasetSpec, DerivaML, Workflow\n",
+    "from deriva_ml.demo_catalog import create_demo_catalog, DemoML"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "1",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "source": [
+    "foo: int = 1\n",
+    "bar: str = \"hello\"\n",
+    "list_parameter: list[float] = [1, 2, 3]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "print('foo', foo)\n",
+    "print('bar', bar)\n",
+    "print('list_parameter', list_parameter)"
+   ],
+   "id": "70b23cdd933ce669"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "hostname = 'dev.eye-ai.org'\n",
+    "domain_schema = 'eye-ai'\n",
+    "\n",
+    "gnl = GlobusNativeLogin(host=hostname)\n",
+    "if gnl.is_logged_in([hostname]):\n",
+    "    print(\"You are already logged in.\")\n",
+    "else:\n",
+    "    gnl.login([hostname], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)\n",
+    "    print(\"Login Successful\")\n"
+   ],
+   "id": "2"
+  },
+  {
+   "cell_type": "code",
+   "id": "3",
+   "metadata": {},
+   "source": [
+    "ml_instance = DemoML(hostname, domain_schema)\n",
+    "print(f'Creating catalog at {ml_instance.catalog_id}')\n",
+    "\n",
+    "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
+    "ml_instance.add_term(MLVocab.asset_type, \"API_Model\", description=\"Model for our API workflow\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "api_workflow = ml_instance.create_workflow(\n",
+    "    name=\"Manual Workflow\",\n",
+    "    workflow_type=\"Manual Workflow\",\n",
+    "    description=\"A manual operation\"\n",
+    ")"
+   ],
+   "id": "5",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "6",
+   "metadata": {},
+   "source": [
+    "manual_execution = ml_instance.create_execution(ExecutionConfiguration( description=\"Sample Execution\", workflow=api_workflow))\n",
+    "manual_execution.upload_execution_outputs()\n",
+    "# Now lets create model configuration for our program."
+   ],
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deriva-test",
+   "language": "python",
+   "name": "deriva-test"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
deriva_ml-1.12.2.dist-info/METADATA → deriva_ml-1.13.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.12.2
+Version: 1.13.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -15,6 +15,7 @@ Requires-Dist: semver>3.0.0
 Requires-Dist: setuptools>=64
 Requires-Dist: setuptools-scm<=6.0
 Requires-Dist: nbstripout
+Requires-Dist: papermill
 Dynamic: license-file

 # DerivaML
deriva_ml-1.12.2.dist-info/RECORD → deriva_ml-1.13.0.dist-info/RECORD RENAMED
@@ -1,27 +1,29 @@
-deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
+deriva_ml/__init__.py,sha256=2sRcX2s72Guo4M7IGW_0_ZyKokZNCbVE6de65tvHBlw,1109
 deriva_ml/database_model.py,sha256=lMbAEqn4n0m7h_JstMX_LX9gbvBIEydG3sRilPn3eLU,14885
-deriva_ml/dataset.py,sha256=oBg4j8loAZA2ccP38fTryeWEolsZ1PStYUOGMlpjE0w,60592
+deriva_ml/dataset.py,sha256=OyWUKWnYeP0ctimSBQ4em-uJrzCNOohx4GPT2uIl6R4,60649
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=yS8oYVshfFtRDyhGPRqtbvxjyd3ZFF29lrB783OP4vM,11849
 deriva_ml/demo_catalog.py,sha256=9Qo3JD4bUIwnL3ngPctc2QBeWApvMR_5UyaK9ockTrY,11536
-deriva_ml/deriva_definitions.py,sha256=2eSbTFQ-9rpctphN4PLo8WdtkzMfhfZr3vJeywt6xPM,8897
-deriva_ml/deriva_ml_base.py,sha256=rrImShp1RXvMuXVLft5GfTnxf_PfF1LONHgV1Ee_E9I,46517
+deriva_ml/deriva_definitions.py,sha256=MZl3c23gArbS-0HZ24VDAyb8HI2Kcb8hFdhSnBLOLfo,9030
+deriva_ml/deriva_ml_base.py,sha256=JYTG_a8SURhrPQBTz6OaGMk0D0sSPWpXqCnoVnSNViI,38501
 deriva_ml/deriva_model.py,sha256=wytGCAHutiUaRfnRKr80Ks_P6ci0_wXRU3vq3lthfYU,13260
-deriva_ml/execution.py,sha256=SggLMAfQevnkGyaixF6dRwn36qHO5s07wkLxQXmNCag,36020
-deriva_ml/execution_configuration.py,sha256=XQeXzPz9Gh_AGa_iYW8zF95niwHed3ojv4gnibB0thA,4082
+deriva_ml/execution.py,sha256=t20sGqPRcUaG-5LLHPaQ01pPP8XpqiCveS1h-Fw_XbQ,38093
+deriva_ml/execution_configuration.py,sha256=WiA4PPijNZUftExN6Qm1YScVD1OY3depNKTutIwOfUg,14063
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=07g0uSrhumdopJluWuWSRMrzagaikAOihqB09bzXBP4,5475
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
+deriva_ml/run_notebook.py,sha256=XzI38WNsu9CKDYbWMt8b5ODtlp27dsWsSuMkKwfeWOE,5484
 deriva_ml/test_functions.py,sha256=-eqLHjjCQCLBNAr1ofbZekNiCOfMISSACRxT_YHER8I,4396
+deriva_ml/test_notebook.ipynb,sha256=CatQIh9whsmYWGpwuyw9XMggQ9-TlCueTyH3Wiv4aBc,3116
 deriva_ml/upload.py,sha256=gHTGXAVlf56EwNzmw5zY0gbBf8h08eU2q2GBbb2FdVc,16087
 deriva_ml/schema_setup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUsO3WWnu1U,9528
 deriva_ml/schema_setup/create_schema.py,sha256=hNMc-v5tferd0UjfdB6nBw7Rc_o-Mg6NkPqQGie9YOw,11700
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.12.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.12.2.dist-info/METADATA,sha256=EcMVbM-QX5myyijRGJnUmaQOjOCHhxHf-27T68A8P18,974
-deriva_ml-1.12.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-deriva_ml-1.12.2.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
-deriva_ml-1.12.2.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.12.2.dist-info/RECORD,,
+deriva_ml-1.13.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.13.0.dist-info/METADATA,sha256=YxPB1VnpB-Y8KL4Yp3VKAYq7F5EUp-R7MfZ1uhWpRZs,999
+deriva_ml-1.13.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+deriva_ml-1.13.0.dist-info/entry_points.txt,sha256=cJnALMa6pjdk6RQCt4HFbKHqALpVa0k6wPeQDPedLJI,295
+deriva_ml-1.13.0.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.13.0.dist-info/RECORD,,
deriva_ml-1.12.2.dist-info/entry_points.txt → deriva_ml-1.13.0.dist-info/entry_points.txt RENAMED
@@ -1,4 +1,5 @@
 [console_scripts]
 deriva-ml-alter-annotation = deriva_ml.schema_setup.alter_annotation:main
 deriva-ml-create-schema = deriva_ml.schema_setup.create_schema:main
+deriva-ml-run-notebook = deriva_ml.run_notebook:main
 deriva-ml-table-comments-utils = deriva_ml.schema_setup.table_comments_utils:main