deriva-ml 1.9.1__py3-none-any.whl → 1.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deriva_ml/demo_catalog.py CHANGED
@@ -295,7 +295,9 @@ def create_demo_catalog(
     dataset_table = deriva_ml.dataset_table
     dataset_table.annotations.update(
         Dataset(
-            deriva_ml.model, deriva_ml.cache_dir
+            deriva_ml.model,
+            cache_dir=deriva_ml.cache_dir,
+            working_dir=deriva_ml.working_dir,
         )._generate_dataset_annotations()
     )
     deriva_ml.model.apply()
deriva_ml/deriva_ml_base.py CHANGED
@@ -1136,8 +1136,8 @@ class DerivaML(Dataset):
         return None

     def create_workflow(
-        self, name: str, workflow_type: str, description: str = "", create: bool = True
-    ) -> RID | None:
+        self, name: str, workflow_type: str, description: str = ""
+    ) -> Workflow:
         """Identify current executing program and return a workflow RID for it

         Determine the notebook or script that is currently being executed. Assume that this is
@@ -1149,10 +1149,21 @@ class DerivaML(Dataset):
             name: The name of the workflow.
             workflow_type: The type of the workflow.
             description: The description of the workflow.
-            create: Whether to create a new workflow.
         """
         # Make sure type is correct.
         self.lookup_term(MLVocab.workflow_type, workflow_type)
+
+        try:
+            subprocess.run(
+                "git rev-parse --is-inside-work-tree",
+                capture_output=True,
+                text=True,
+                shell=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("Not executing in a Git repository.")
+
         github_url, is_dirty = self._github_url()

         if is_dirty:
@@ -1174,14 +1185,13 @@ class DerivaML(Dataset):
             shell=True,
         ).stdout.strip()

-        workflow = Workflow(
+        return Workflow(
             name=name,
             url=github_url,
             checksum=checksum,
             description=description,
             workflow_type=workflow_type,
         )
-        return self.add_workflow(workflow) if create else None

     def _github_url(self) -> tuple[str, bool]:
         """Return a GitHUB URL for the latest commit of the script from which this routine is called.
@@ -1238,7 +1248,9 @@ class DerivaML(Dataset):
         return url, is_dirty

     # @validate_call
-    def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
+    def create_execution(
+        self, configuration: ExecutionConfiguration, dry_run: bool = False
+    ) -> "Execution":
         """Create an execution object

         Given an execution configuration, initialize the local compute environment to prepare for executing an
@@ -1249,18 +1261,14 @@ class DerivaML(Dataset):

         Args:
             configuration: ExecutionConfiguration:
+            dryrun: Do not create an execution record or upload results.

         Returns:
             An execution object.
         """
         from .execution import Execution

-        if self._execution:
-            DerivaMLException(
-                "Only one execution can be created for a Deriva ML instance."
-            )
-        else:
-            self._execution = Execution(configuration, self)
+        self._execution = Execution(configuration, self, dry_run=dry_run)
         return self._execution

     # @validate_call
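
Note: the deriva_ml_base.py changes above decouple describing a workflow from registering it. create_workflow() now returns a Workflow object instead of inserting one (the create flag is gone, and a check for a Git working tree is added), and create_execution() forwards a new dry_run flag to Execution. A minimal sketch of the new calling pattern, assuming ml is an already-connected DerivaML instance and that a "python_script" term already exists in the workflow_type vocabulary (both assumptions are illustrative, not part of this diff):

    from deriva_ml.execution_configuration import ExecutionConfiguration

    # ml: DerivaML -- connected to a catalog elsewhere (illustrative).
    workflow = ml.create_workflow(
        name="train-model",
        workflow_type="python_script",  # must already be a workflow_type vocabulary term
        description="Nightly training run",
    )  # returns a Workflow object; nothing is written to the catalog here

    config = ExecutionConfiguration(workflow=workflow, description="Training run")
    execution = ml.create_execution(config, dry_run=True)  # dry_run=True skips catalog writes
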
deriva_ml/execution.py CHANGED
@@ -18,6 +18,7 @@ from typing import Iterable, Any, Optional
 from deriva.core import format_exception
 from deriva.core.ermrest_model import Table
 from pydantic import validate_call, ConfigDict
+import sys

 from .deriva_definitions import MLVocab, ExecMetadataVocab
 from .deriva_definitions import (
@@ -30,7 +31,7 @@ from .deriva_definitions import (
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
-from .execution_configuration import ExecutionConfiguration
+from .execution_configuration import ExecutionConfiguration, Workflow
 from .execution_environment import get_execution_environment
 from .upload import (
     execution_metadata_dir,
@@ -96,6 +97,7 @@ class Execution:
         configuration: ExecutionConfiguration,
         ml_object: "DerivaML",
         reload: Optional[RID] = None,
+        dry_run: bool = False,
     ):
         """

@@ -107,23 +109,36 @@ class Execution:
         self.asset_paths: list[Path] = []
         self.configuration = configuration
         self._ml_object = ml_object
+        self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
         self.uploaded_assets: list[Path] = []
+        self.configuration.argv = sys.argv

         self.dataset_rids: list[RID] = []
         self.datasets: list[DatasetBag] = []
+        self.parameters = self.configuration.parameters

         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
+        self._dry_run = dry_run

-        self.workflow_rid = self.configuration.workflow
-
-        if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
-            raise DerivaMLException(
-                "Workflow specified in execution configuration is not a Workflow"
+        if isinstance(self.configuration.workflow, Workflow):
+            self.workflow_rid = (
+                self._ml_object.add_workflow(self.configuration.workflow)
+                if not self._dry_run
+                else "0000"
             )
+        else:
+            self.workflow_rid = self.configuration.workflow
+            if (
+                self._ml_object.resolve_rid(configuration.workflow).table.name
+                != "Workflow"
+            ):
+                raise DerivaMLException(
+                    "Workflow specified in execution configuration is not a Workflow"
+                )

         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
@@ -142,6 +157,10 @@ class Execution:
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
+            if self.execution_rid == "0000":
+                self._dry_run = True
+        elif self._dry_run:
+            self.execution_rid = "0000"
         else:
             self.execution_rid = schema_path.Execution.insert(
                 [
@@ -189,7 +208,7 @@ class Execution:
                 self.dataset_rids.append(dataset.rid)
         # Update execution info
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        if self.dataset_rids and not reload:
+        if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [
                     {"Dataset": d, "Execution": self.execution_rid}
@@ -203,7 +222,7 @@ class Execution:
             self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
             for a in self.configuration.assets
         ]
-        if self.asset_paths and not reload:
+        if self.asset_paths and not (reload or self._dry_run):
             self._update_execution_asset_table(self.configuration.assets)

         # Save configuration details for later upload
@@ -242,6 +261,11 @@ class Execution:
             msg: Additional information about the status
         """
         self.status = status
+        self._logger.info(msg)
+
+        if self._dry_run:
+            return
+
         self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
             [
                 {
@@ -278,7 +302,7 @@ class Execution:

         self.start_time = datetime.now()
         self.uploaded_assets = None
-        self.update_status(Status.initializing, "Start ML algorithm ...")
+        self.update_status(Status.initializing, "Start execution ...")

     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
@@ -288,13 +312,11 @@ class Execution:
         minutes, seconds = divmod(remainder, 60)
         duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"

-        if self._ml_object._is_notebook:
-            self._create_notebook_checkpoint()
-
         self.update_status(Status.completed, "Algorithm execution ended.")
-        self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
-            [{"RID": self.execution_rid, "Duration": duration}]
-        )
+        if not self._dry_run:
+            self._ml_object.pathBuilder.schemas[
+                self._ml_object.ml_schema
+            ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])

     def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
         """Upload execution assets at _working_dir/Execution_asset.
@@ -402,6 +424,8 @@ class Execution:
             Uploaded assets with key as assets' suborder name, values as an
                 ordered dictionary with RID and metadata in the Execution_Asset table.
         """
+        if self._dry_run:
+            return {}
         try:
             uploaded_assets = self._upload_execution_dirs()
             self.update_status(Status.completed, "Successfully end the execution.")
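
Note: the execution.py changes above give Execution a dry-run mode. A dry-run execution takes the sentinel RID "0000", logs status messages through the DerivaML logger, and skips every catalog write (the Execution insert and updates, Dataset_Execution links, asset-table updates, and output uploads, which return an empty dict). A short sketch of the behavior, reusing the illustrative ml and config from the previous example:

    # No Execution record is inserted, so the RID is the sentinel value.
    execution = ml.create_execution(config, dry_run=True)
    print(execution.execution_rid)  # -> "0000"

    # Status changes are logged locally; the catalog Execution row is never updated.
    execution.execution_stop()

    # Reloading with the sentinel RID re-enables dry-run mode:
    # Execution(config, ml, reload="0000")
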
deriva_ml/execution_configuration.py CHANGED
@@ -1,14 +1,11 @@
 from __future__ import annotations

 import json
-from typing import Optional
+from typing import Optional, Any

-from pydantic import (
-    BaseModel,
-    conlist,
-    ConfigDict,
-)
+from pydantic import BaseModel, conlist, ConfigDict, field_validator, Field
 from pathlib import Path
+import sys


 from .dataset_aux_classes import DatasetSpec
@@ -43,17 +40,30 @@ class ExecutionConfiguration(BaseModel):
         datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
             should be materialized.
         assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
+        parameters: Either a dictionary or a path to a JSON file that contains configuration parameters for the execution.
         workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
         description: A description of the execution. Can use Markdown format.
     """

     datasets: conlist(DatasetSpec) = []
     assets: list[RID | str] = []  # List of RIDs to model files.
-    workflow: RID
+    workflow: RID | Workflow
+    parameters: dict[str, Any] = {}
     description: str = ""
+    argv: conlist(str) = Field(default_factory=lambda: sys.argv)

     model_config = ConfigDict(arbitrary_types_allowed=True)

+    @field_validator("parameters", mode="before")
+    @classmethod
+    def validate_parameters(cls, value: Any) -> Any:
+        """If parameter is a file, assume that it has JSON contents for configuration parameters"""
+        if isinstance(value, str) or isinstance(value, Path):
+            with open(value, "r") as f:
+                return json.load(f)
+        else:
+            return value
+
     @staticmethod
     def load_configuration(path: Path) -> ExecutionConfiguration:
         """Create a ExecutionConfiguration from a JSON configuration file.
deriva_ml-1.10.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.9.1
+Version: 1.10.1
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
deriva_ml-1.10.1.dist-info/RECORD CHANGED
@@ -3,12 +3,12 @@ deriva_ml/database_model.py,sha256=58iweWRteLeKKjjeNA9_e7TbUb4Av92lxH2zKvZzwA8,1
 deriva_ml/dataset.py,sha256=h7Zkhnhy66GhPg6O1ud-YCx-jFKAabWF-nwuIDsR8SU,60785
 deriva_ml/dataset_aux_classes.py,sha256=YxjQnu2kS9kK_f8bGqhmgE6ty9GNeitCxfvReT9vaM0,6537
 deriva_ml/dataset_bag.py,sha256=aOJxFA9t5apjE5BNBrk8Pi9R1Cp8AWnnaL-10P8ELrQ,11515
-deriva_ml/demo_catalog.py,sha256=zQAHWSvrVPxMg-vyRUqoC0Jj5RhfGjkBwXW3mBksLhA,10986
+deriva_ml/demo_catalog.py,sha256=1442Lbxmlq45_fgFx0SZPag6dZLimXCk57-TRFee3VA,11064
 deriva_ml/deriva_definitions.py,sha256=jNiInYA2Cb1GE4OOT1CofxBygdLDSOmNsw5Wl6NbZQE,8943
-deriva_ml/deriva_ml_base.py,sha256=nzPzn_iLQIUJDCxTdRgAVEWqS7LbRTZriofWYmdEYe8,46975
+deriva_ml/deriva_ml_base.py,sha256=9LeHUf20MTL3wawUAZz0rRZrxdjo-kki2zRpfv7Rgzg,47141
 deriva_ml/deriva_model.py,sha256=B4gwr3-92IQU-mEZlusgNEnRyulD96esWGS67q9MzHk,12024
-deriva_ml/execution.py,sha256=on8hAtuZr9qFiyxuk_vDCmnRJ9Cv4kFOgHK4HY4CmV8,29585
-deriva_ml/execution_configuration.py,sha256=vsdL31J09dz7CQDd2rYXIjyBPwNlgAWvrTqsXNWi82g,3357
+deriva_ml/execution.py,sha256=15z4S5tElF-pUFIKgPGmxaC1wwh4Via0Mfd1S_ZiZ8c,30404
+deriva_ml/execution_configuration.py,sha256=ZdLHLTUcg5V1id1sVjbp7Nm5bjh42ATG7hOGKaiCSj4,4013
 deriva_ml/execution_environment.py,sha256=bCRKrCELDbGQDo7_FKfw7e8iMzVjSRZK3baKkqH5-_0,3264
 deriva_ml/feature.py,sha256=7e8WYPCfJSrGxJh9oUTduYSnB5ekybRhXa_0HIigS_w,5459
 deriva_ml/history.py,sha256=qTDLDs8Ow_6r7mDO0gZm0Fg81SWKOAgtCU5pzZoDRgM,2828
@@ -19,9 +19,9 @@ deriva_ml/schema_setup/annotations.py,sha256=v0gTpmWYxRqsQ-bcnQzsr8WowGv2pi9pZUs
 deriva_ml/schema_setup/create_schema.py,sha256=BRdYeWW5I8HxuATkB1hkKuIw4n-JQu620xod7EQoVSE,10674
 deriva_ml/schema_setup/policy.json,sha256=77sf0Imy6CAQV0_VwwbA56_KROJ05WXsvT-Wjtkk538,1633
 deriva_ml/schema_setup/table_comments_utils.py,sha256=-2_ubEpoH7ViLVb-ZfW9wZbQ26DTKNgjkCABMzGu4i4,2140
-deriva_ml-1.9.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deriva_ml-1.9.1.dist-info/METADATA,sha256=zdDl9mmw2-DwvfYyWtq3vKnsB175gRFUsfHvRNLOGLg,941
-deriva_ml-1.9.1.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
-deriva_ml-1.9.1.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
-deriva_ml-1.9.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
-deriva_ml-1.9.1.dist-info/RECORD,,
+deriva_ml-1.10.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deriva_ml-1.10.1.dist-info/METADATA,sha256=0kxsip-JxmgtpvQeHUebq0DDUB3sSOsFJeIpJ6Qdaww,942
+deriva_ml-1.10.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+deriva_ml-1.10.1.dist-info/entry_points.txt,sha256=ZiOvrYj022x544TQwi018ujeHRRDahNmwJnzn5ThacM,242
+deriva_ml-1.10.1.dist-info/top_level.txt,sha256=I1Q1dkH96cRghdsFRVqwpa2M7IqJpR2QPUNNc5-Bnpw,10
+deriva_ml-1.10.1.dist-info/RECORD,,
deriva_ml-1.10.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (77.0.3)
+Generator: setuptools (78.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any