deriva-ml 1.8.4__py3-none-any.whl → 1.8.10__py3-none-any.whl
This diff shows the content of publicly available package versions as published to their respective public registries. It is provided for informational purposes only and reflects the changes between the two released versions.
- deriva_ml/__init__.py +10 -0
- deriva_ml/deriva_ml_base.py +107 -68
- deriva_ml/execution.py +1 -1
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/METADATA +5 -3
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/RECORD +9 -11
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/WHEEL +1 -1
- deriva_ml/VERSION.py +0 -1
- deriva_ml/deriva_ml_execute.py +0 -104
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/entry_points.txt +0 -0
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info/licenses}/LICENSE +0 -0
- {deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/top_level.txt +0 -0
deriva_ml/__init__.py
CHANGED
@@ -4,6 +4,7 @@ __all__ = [
     "FileUploadState",
     "FileSpec",
     "ExecutionConfiguration",
+    "Execution",
     "Workflow",
     "DatasetBag",
     "DatasetVersion",
@@ -39,4 +40,13 @@ from .execution_configuration import (
     ExecutionConfiguration,
     Workflow,
 )
+from .execution import Execution
+
+from importlib.metadata import version, PackageNotFoundError
+
+try:
+    __version__ = version("deriva_ml")
+except PackageNotFoundError:
+    # package is not installed
+    pass

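
The new __init__.py stops shipping a hard-coded VERSION.py and instead resolves the package version from installed distribution metadata. A minimal sketch of that pattern, assuming the package has been installed; the explicit fallback value is an illustrative addition, the shipped code simply passes:

from importlib.metadata import version, PackageNotFoundError

try:
    # Read the version recorded in the installed dist-info, so it always matches the wheel.
    __version__ = version("deriva_ml")
except PackageNotFoundError:
    # Running from an uninstalled source checkout; no metadata is available.
    __version__ = "0+unknown"  # illustrative fallback, not part of the released code
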
deriva_ml/deriva_ml_base.py
CHANGED
@@ -13,16 +13,15 @@ from __future__ import annotations
 import getpass
 import logging
 from datetime import datetime
-import hashlib
 from itertools import chain
 import inspect
+import setuptools_scm
 from pathlib import Path
 import requests
-from setuptools_git_versioning import get_latest_file_commit
 import subprocess
+import shutil
 from typing import Optional, Any, Iterable, TYPE_CHECKING
 from deriva.core import (
-    ErmrestCatalog,
     get_credential,
     urlquote,
     DEFAULT_SESSION_CONFIG,
@@ -30,6 +29,7 @@ from deriva.core import (
 )
 import deriva.core.datapath as datapath
 from deriva.core.datapath import DataPathException
+from deriva.core.deriva_server import DerivaServer
 from deriva.core.ermrest_catalog import ResolveRidResult
 from deriva.core.ermrest_model import Key, Table
 from deriva.core.hatrac_store import HatracStore
@@ -115,13 +115,13 @@ class DerivaML(Dataset):
             model_version: A string that indicates the version model. Typically passed in via
         """
         self.credential = get_credential(hostname)
-
+        server = DerivaServer(
             "https",
             hostname,
-
-            self.credential,
+            credentials=self.credential,
             session_config=self._get_session_config(),
         )
+        self.catalog = server.connect_ermrest(catalog_id)
         self.model = DerivaModel(
             self.catalog.getCatalogModel(), domain_schema=domain_schema
         )
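
The constructor now obtains its catalog handle through DerivaServer.connect_ermrest instead of constructing an ErmrestCatalog directly. A standalone sketch of that connection path, with illustrative hostname and catalog id:

from deriva.core import get_credential
from deriva.core.deriva_server import DerivaServer

hostname, catalog_id = "demo.derivacloud.org", "1"      # illustrative values
credential = get_credential(hostname)

# Bind to the server once, then connect to a specific ERMrest catalog on it.
server = DerivaServer("https", hostname, credentials=credential)
catalog = server.connect_ermrest(catalog_id)
model = catalog.getCatalogModel()                        # same call the diff makes next
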
@@ -142,6 +142,8 @@ class DerivaML(Dataset):

         # Initialize dataset class.
         super().__init__(self.model, self.cache_dir)
+        self._logger = logging.getLogger("deriva_ml")
+        self._logger.setLevel(logging_level)

         self.host_name = hostname
         self.catalog_id = catalog_id
@@ -149,25 +151,12 @@ class DerivaML(Dataset):
         self.version = model_version
         self.configuration = None
         self._execution: Optional[Execution] = None
-        self.
-
-            from IPython import get_ipython
-
-            ipython = get_ipython()
-            # Check if running in Jupyter's ZMQ kernel (used by notebooks)
-            if ipython is not None and "IPKernelApp" in ipython.config:
-                self._notebook = Path(ipython.user_ns.get("__session__"))
-            # Check if running in Jupyter's ZMQ kernel (used by notebooks)
-        except (ImportError, AttributeError):
-            pass
-
+        self._script_path, self._is_notebook = self._get_python_script()
+        self._notebook = self._get_python_notebook()
         self.domain_schema = self.model.domain_schema
         self.project_name = project_name or self.domain_schema
-
         self.start_time = datetime.now()
         self.status = Status.pending.value
-        self._logger = logging.getLogger("deriva_ml")
-        self._logger.setLevel(logging_level)

         logging.basicConfig(
             level=logging_level,
@@ -190,6 +179,65 @@ class DerivaML(Dataset):
         except (AttributeError, requests.HTTPError):
             pass

+    def _get_python_notebook(self) -> Path | None:
+        """Figure out if you are running in a Jupyter notebook
+
+        Returns:
+            A Path to the notebook file that is currently being executed.
+        """
+        notebook = None
+        try:
+            ipython = get_ipython()
+            # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+            if ipython is not None and "IPKernelApp" in ipython.config:
+                notebook = Path(ipython.user_ns.get("__session__"))
+                # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+                try:
+                    if subprocess.run(
+                        [shutil.which("nbstripout"), "--is-installed"],
+                        check=False,
+                        capture_output=True,
+                    ).returncode:
+                        self._logger.warning(
+                            "nbstripout is not installed in repository. Please run nbstripout --install"
+                        )
+                except subprocess.CalledProcessError:
+                    self._logger.error("nbstripout is not found.")
+        except (ImportError, AttributeError):
+            pass
+        return notebook
+
+    def _get_python_script(self) -> tuple[Path, bool]:
+        """Return the path to the currently executing script"""
+        is_notebook = False
+        if filename := self._get_python_notebook():
+            is_notebook = True
+        else:
+            stack = inspect.stack()
+            if len(stack) > 1:
+                filename = Path(
+                    stack[2].filename
+                )  # Get the caller's filename, which is two up the stack from here.
+            else:
+                raise DerivaMLException(
+                    f"Looking for caller failed"
+                )  # Stack is too shallow
+        return filename, is_notebook
+
+    def _get_git_root(self):
+        try:
+            result = subprocess.run(
+                ["git", "rev-parse", "--show-toplevel"],
+                cwd=self._script_path.parent,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.DEVNULL,
+                text=True,
+                check=True
+            )
+            return result.stdout.strip()
+        except subprocess.CalledProcessError:
+            return None  # Not in a git repository
+
     @staticmethod
     def _get_session_config():
         """ """
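
The new _get_git_root() locates the repository that contains the running script by shelling out to git rev-parse --show-toplevel from the script's directory. A standalone sketch of the same lookup; the function name and the extra FileNotFoundError guard are illustrative additions:

import subprocess
from pathlib import Path

def git_root(start: Path) -> Path | None:
    """Return the top-level directory of the git checkout containing start, or None."""
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            cwd=start,
            capture_output=True,
            text=True,
            check=True,
        )
        return Path(result.stdout.strip())
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None  # not inside a git repository, or git is not on PATH

print(git_root(Path.cwd()))
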
@@ -213,6 +261,9 @@ class DerivaML(Dataset):
         """Get a new instance of a pathBuilder object."""
         return self.catalog.getPathBuilder()

+    def get_version(self) -> str:
+        return setuptools_scm.get_version(root=self._get_git_root())
+
     @property
     def domain_path(self):
         """Get a new instance of a pathBuilder object to the domain schema"""
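
get_version() now derives the running code's version from the enclosing git checkout via setuptools_scm rather than from a static file. A minimal sketch of that call, assuming a tagged clone of the repository; the path is an illustrative placeholder for what _get_git_root() returns:

import setuptools_scm

repo_root = "/path/to/deriva-ml"   # illustrative checkout location

# Yields something like "1.8.10" at a tag, or a dev version such as
# "1.8.11.dev3+g1a2b3c4" when commits exist past the latest tag.
print(setuptools_scm.get_version(root=repo_root))
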
@@ -1001,40 +1052,38 @@
     ) -> RID:
         """Identify current executing program and return a workflow RID for it

-
+        Determine the notebook or script that is currently being executed. Assume that this is
         being executed from a cloned GitHub repository. Determine the remote repository name for
-        this object. Then either retrieve an existing workflow for this executable
+        this object. Then either retrieve an existing workflow for this executable or create
        a new one.

        Args:
            name: The name of the workflow.
            workflow_type: The type of the workflow.
            description: The description of the workflow.
-            create: Whether
+            create: Whether to create a new workflow.
        """
        # Make sure type is correct.
        self.lookup_term(MLVocab.workflow_type, workflow_type)
-
+        github_url, is_dirty = self._github_url()

        if is_dirty:
            self._logger.warning(
-                f"File {
+                f"File {self._script_path} has been modified since last commit. Consider commiting before executing"
            )

-
-
-
-
-
-
-
-
-
-
-
-
-                sha256_hash.update(f.read())
-            checksum = "SHA-256:" + sha256_hash.hexdigest()
+        # If you are in a notebook, strip out the outputs before computing the checksum.
+        cmd = (
+            f"nbstripout {self._script_path} | git hash-object --stdin"
+            if self._is_notebook
+            else f"git hash-object {self._script_path}"
+        )
+        checksum = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            check=True,
+        ).stdout.strip()

        workflow = Workflow(
            name=name,
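
The workflow checksum is no longer a SHA-256 over the raw file; it is the git hash-object of the file, with notebook outputs stripped first so that re-running a notebook does not change its identity. A sketch of that computation; the helper name is illustrative, and shell=True is used here because the notebook case pipes nbstripout into git:

import subprocess

def committed_checksum(path: str, is_notebook: bool) -> str:
    """Hash a file the way git stores blobs, ignoring notebook outputs."""
    cmd = (
        f"nbstripout {path} | git hash-object --stdin"
        if is_notebook
        else f"git hash-object {path}"
    )
    return subprocess.run(
        cmd, shell=True, capture_output=True, text=True, check=True
    ).stdout.strip()

print(committed_checksum("analysis.ipynb", is_notebook=True))   # illustrative file
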
@@ -1045,67 +1094,57 @@
         )
         return self.add_workflow(workflow) if create else None

-    def _github_url(self) -> tuple[str,
+    def _github_url(self) -> tuple[str, bool]:
        """Return a GitHUB URL for the latest commit of the script from which this routine is called.

        This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
        the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
        file in GitHUB.

-        Returns: A tuple with the
+        Returns: A tuple with the gethub_url and a boolean to indicated if uncommited changes
        have been made to the file.

        """

-        # Get the name of the script that is calling this function.
-        if self._notebook:
-            # Try to get the __session__ variable from the user namespace.
-            filename = Path("").absolute().parent / self._notebook
-        else:
-            stack = inspect.stack()
-            if len(stack) > 1:
-                filename = Path(
-                    stack[2].filename
-                )  # Get the caller's filename, which is two up the stack from here.
-            else:
-                raise DerivaMLException(
-                    f"Looking for caller failed"
-                )  # Stack is too shallow
-
        # Get repo URL from local github repo.
        try:
            result = subprocess.run(
-                ["git", "remote", "get-url", "origin"], capture_output=True, text=True
+                ["git", "remote", "get-url", "origin"], capture_output=True, text=True,
+                cwd=self._script_path.parent,
            )
            github_url = result.stdout.strip().removesuffix(".git")
        except subprocess.CalledProcessError:
            raise DerivaMLException(f"No GIT remote found")

        # Find the root directory for the repository
-        repo_root =
-        while repo_root != repo_root.root:
-            if (repo_root / ".git").exists():
-                break
-            else:
-                repo_root = repo_root.parent
+        repo_root = self._get_git_root()

        # Now check to see if file has been modified since the last commit.
        try:
            result = subprocess.run(
                ["git", "status", "--porcelain"],
+                cwd=self._script_path.parent,
                capture_output=True,
                text=True,
                check=True,
            )
            is_dirty = bool(
-                "
+                "M " in result.stdout.strip()
            )  # Returns True if output indicates a modified file
        except subprocess.CalledProcessError:
            is_dirty = False  # If Git command fails, assume no changes

-
-
-
+        """Get SHA-1 hash of latest commit of the file in the repository"""
+        result = subprocess.run(
+            ["git", "log", "-n", "1", "--pretty=format:%H" "--", self._script_path],
+            cwd=self._script_path.parent,
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        sha = result.stdout.strip()
+        url = f"{github_url}/blob/{sha}/{self._script_path.relative_to(repo_root)}"
+        return url, is_dirty

    # @validate_call
    def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
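
The value _github_url() returns is a permalink assembled from three pieces gathered above: the origin remote URL, the SHA of the last commit that touched the file, and the file's path relative to the repository root. A sketch of the assembly with illustrative values:

# Stand-ins for what the git subprocess calls above would return.
github_url = "https://github.com/example-org/deriva-ml"       # git remote get-url origin, ".git" removed
sha = "1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b"               # git log -n 1 --pretty=format:%H -- <file>
relative_path = "src/deriva_ml/deriva_ml_base.py"              # script path relative to the repo root

url = f"{github_url}/blob/{sha}/{relative_path}"
print(url)
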
deriva_ml/execution.py
CHANGED
@@ -254,7 +254,7 @@ class Execution:
     def _create_notebook_checkpoint(self):
         """Trigger a checkpoint creation using Jupyter's API."""
         notebook_name = self._ml_object._notebook
-
+
         # Look for the server running this notebook.
         root = Path("").absolute().parent.as_posix()
         servers = list(list_running_servers())

{deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.8.4
+Version: 1.8.10
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -11,8 +11,10 @@ Requires-Dist: pandas
 Requires-Dist: regex~=2024.7.24
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: semver>3.0.0
-Requires-Dist: setuptools
+Requires-Dist: setuptools>=64
+Requires-Dist: setuptools-scm<=6.0
 Requires-Dist: nbstripout
+Dynamic: license-file

 Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
 using a deriva catalog.

{deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/RECORD
CHANGED

@@ -1,15 +1,13 @@
-deriva_ml/
-deriva_ml/__init__.py,sha256=0PHNB8gRDALLtaffRmU7wCUgWbRHVQZcjuPJxMLNEco,856
+deriva_ml/__init__.py,sha256=r1Z9N5vtZkAET7emqhpAx2bf_xJUp5wHOc4_DIplsG8,1082
 deriva_ml/database_model.py,sha256=uhoyVyd8MQmY8J9ovCH8fjxhZDxxXNkdJyYdeyEGPXA,13898
 deriva_ml/dataset.py,sha256=xC6QPUp4MZcJiEnOEU3NnzoLBL9RcJWtPTyzIQP0Ivw,60666
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=e6IHv3saZUnZRfl0EjfnlV2NnmPeOagYYv3PuZqS1l0,11501
 deriva_ml/demo_catalog.py,sha256=xQPhFlflqwJskNQrQ-jdBSnGzBm2-aONBgcRxfsdNKM,11045
 deriva_ml/deriva_definitions.py,sha256=pZLPoUxiuJ-uGglmQ6sF9oVXsSUuOnPEqywoec78XNM,8893
-deriva_ml/deriva_ml_base.py,sha256=
-deriva_ml/deriva_ml_execute.py,sha256=y_rGjc97eidBuzy-AaQGe93vuTbWbkNkK9rpReqV0IY,4433
+deriva_ml/deriva_ml_base.py,sha256=aVyGsFERZtpjNxfaVYzvKa7J0Ma-U3DEibfjnbr7lFQ,43817
 deriva_ml/deriva_model.py,sha256=LV3FjIhIlz13ckZSmu0aOJhT9EVE0-M9oVMudfkxb0g,12004
-deriva_ml/execution.py,sha256=
+deriva_ml/execution.py,sha256=VlapQGPDQI2MOmYnA5-hpf-XM6Fu4hPLpFjNN5q9Udo,29889
 deriva_ml/execution_configuration.py,sha256=bjnZwXN6M7YPy5dFQwoGEBU8YjhQRSe1FW0rL0V9TaM,3422
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
@@ -27,9 +25,9 @@ deriva_ml/schema_setup/annotations.py,sha256=Uogm9YkRtoKSdgfQlICqRywbCATppwBO-Xr
 deriva_ml/schema_setup/create_schema.py,sha256=jwziMWJPbjRgjiRBT-KtidnXI8YNEFO74A9fwfptjHY,10626
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
-deriva_ml-1.8.
+deriva_ml-1.8.10.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.8.10.dist-info/METADATA,sha256=Mhx0joyR1gPEX8G6ZoEpvxNVW4sUG9C_S5TIA6ueZKk,670
+deriva_ml-1.8.10.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
+deriva_ml-1.8.10.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.8.10.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.8.10.dist-info/RECORD,,
deriva_ml/VERSION.py
DELETED
@@ -1 +0,0 @@
-__version__ = "1.8.4"
deriva_ml/deriva_ml_execute.py
DELETED
@@ -1,104 +0,0 @@
-from sympy import cxxcode
-
-from deriva_ml import DerivaML, execution_configuration
-
-def execute(host, catalog, script):
-    workflow_rid = foobar
-    execution_configuration = cxxcode(
-
-    )
-    ml_instance = DerivaML()
-    ml_instance.create_execution(configuration)
-    script
-
-
-from deriva_ml import DerivaML, ExecutionConfiguration, DatasetSpec, RID, DerivaMLException
-import os
-import sys
-import json
-import traceback
-import argparse
-import requests
-from requests.exceptions import HTTPError, ConnectionError
-from deriva.transfer import GenericDownloader
-from deriva.transfer.download import DerivaDownloadError, DerivaDownloadConfigurationError, \
-    DerivaDownloadAuthenticationError, DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, \
-    DerivaDownloadBaggingError
-from deriva.core import BaseCLI, KeyValuePairArgs, format_credential, format_exception, urlparse
-
-
-class DerivaMLExecCLI(BaseCLI):
-    def __init__(self, description, epilog, **kwargs):
-
-        BaseCLI.__init__(self, description, epilog, **kwargs)
-        self.parser.add_argument("--catalog", default=1, metavar="<1>", help="Catalog number. Default: 1")
-        self.parser.add_argument("--timeout", metavar="<seconds>",
-                                 help="Total number of seconds elapsed before the download is aborted.")
-        self.parser.add_argument("output_dir", metavar="<output dir>", help="Path to an output directory.")
-        self.parser.add_argument("envars", metavar="[key=value key=value ...]",
-                                 nargs=argparse.REMAINDER, action=KeyValuePairArgs, default={},
-                                 help="Variable length of whitespace-delimited key=value pair arguments used for "
-                                      "string interpolation in specific parts of the configuration file. "
-                                      "For example: key1=value1 key2=value2")
-
-    def main(self):
-        try:
-            args = self.parse_cli()
-        except ValueError as e:
-            sys.stderr.write(str(e))
-            return 2
-        if not args.quiet:
-            sys.stderr.write("\n")
-
-        try:
-            try:
-                ml_instance = DerivaML(args.hostname, args.catalog)
-                downloaded = self.execute()
-                sys.stdout.write("\n%s\n" % (json.dumps(downloaded)))
-            except ConnectionError as e:
-                raise DerivaDownloadError("Connection error occurred. %s" % format_exception(e))
-            except HTTPError as e:
-                if e.response.status_code == requests.codes.unauthorized:
-                    raise DerivaDownloadAuthenticationError(
-                        "The requested service requires authentication and a valid login session could "
-                        "not be found for the specified host. Server responded: %s" % e)
-                elif e.response.status_code == requests.codes.forbidden:
-                    raise DerivaDownloadAuthorizationError(
-                        "A requested operation was forbidden. Server responded: %s" % e)
-        except (DerivaDownloadError, DerivaDownloadConfigurationError, DerivaDownloadAuthenticationError,
-                DerivaDownloadAuthorizationError, DerivaDownloadTimeoutError, DerivaDownloadBaggingError) as e:
-            sys.stderr.write(("\n" if not args.quiet else "") + format_exception(e))
-            if args.debug:
-                traceback.print_exc()
-            return 1
-        except:
-            sys.stderr.write("An unexpected error occurred.")
-            traceback.print_exc()
-            return 1
-        finally:
-            if not args.quiet:
-                sys.stderr.write("\n\n")
-        return 0
-
-
-def do_stuff():
-    pass
-
-def main(datasets: list[RID], model: list[RID], hostname: str, catalog_id: str):
-    my_url = DerivaML.github_url()
-    ml_instance = DerivaML(hostname, catalog_id)
-    ml_instance.lookup_workflow(my_url)
-    config = ExecutionConfiguration(
-        datasets=[DatasetSpec(rid=dataset,
-                              version=ml_instance.dataset_version(dataset)) for dataset in datasets],
-        assets=model,
-        workflow= ml_instance.lookup_workflow(my_url)
-    )
-    execution = ml_instance.create_execution(config)
-    with execution as e:
-        do_stuff()
-        execution.upload_execution_outputs()
-
-if __name__ == "__main__":
-    main(datasets, model, hostname, catalog_id)
-    if __file__ == matplotlib_inline

{deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/entry_points.txt
File without changes

{deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info/licenses}/LICENSE
File without changes

{deriva_ml-1.8.4.dist-info → deriva_ml-1.8.10.dist-info}/top_level.txt
File without changes