PyPI - deriva-ml - Versions diffs - 1.9.0__tar.gz → 1.10.0__tar.gz - Mend

deriva-ml 1.9.0.tar.gz → 1.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (75) hide show

{deriva_ml-1.9.0/src/deriva_ml.egg-info → deriva_ml-1.10.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.9.0
+Version: 1.10.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -16,5 +16,14 @@ Requires-Dist: setuptools-scm<=6.0
 Requires-Dist: nbstripout
 Dynamic: license-file
-Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
+# DerivaML
+Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
 using a deriva catalog.
+## Installing the GitHub CLI
+The script release.sh will create a new release tag in GitHub.  This script requires the
+GitHUB CLI be installed.
+See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.

deriva_ml-1.10.0/README.md ADDED Viewed

@@ -0,0 +1,11 @@
+# DerivaML
+Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
+using a deriva catalog.
+## Installing the GitHub CLI
+The script release.sh will create a new release tag in GitHub.  This script requires the
+GitHUB CLI be installed.
+See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/release.sh RENAMED Viewed

@@ -9,6 +9,7 @@ fi
 # Default version bump is patch unless specified (patch, minor, or major)
 VERSION_TYPE=${1:-patch}
 echo "Bumping version: $VERSION_TYPE"
 # Bump the version using bump-my-version.
@@ -32,6 +33,6 @@ python -m build
 NEW_TAG=$(git describe --tags --abbrev=0)
 echo "New version tag: $NEW_TAG"
-twine upload "dist/*${NEW_TAG/v/}"
+twine upload dist/*${NEW_TAG/v/}*
 echo "Release process complete!"

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/database_model.py RENAMED Viewed

@@ -84,7 +84,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         except KeyError:
             raise DerivaMLException(f"Dataset {dataset_rid} not found")
-    def __init__(self, minid: DatasetMinid, bag_path: Path):
+    def __init__(self, minid: DatasetMinid, bag_path: Path, dbase_path: Path):
         """Create a new DatabaseModel.
         Args:
@@ -95,8 +95,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         self.bag_path = bag_path
         self.minid = minid
         self.dataset_rid = minid.dataset_rid
-        dir_path = bag_path.parent
-        self.dbase_file = dir_path / f"{minid.version_rid}.db"
+        self.dbase_file = dbase_path / f"{minid.version_rid}.db"
         self.dbase = sqlite3.connect(self.dbase_file)
         super().__init__(

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/dataset.py RENAMED Viewed

@@ -67,11 +67,12 @@ class Dataset:
     _Logger = logging.getLogger("deriva_ml")
-    def __init__(self, model: DerivaModel, cache_dir: Path):
+    def __init__(self, model: DerivaModel, cache_dir: Path, working_dir: Path):
         self._model = model
         self._ml_schema = ML_SCHEMA
         self.dataset_table = self._model.schemas[self._ml_schema].tables["Dataset"]
         self._cache_dir = cache_dir
+        self._working_dir = working_dir
         self._logger = logging.getLogger("deriva_ml")
     def _is_dataset_rid(self, dataset_rid: RID, deleted: bool = False) -> bool:
@@ -783,7 +784,6 @@ class Dataset:
         snapshot: Optional[Dataset] = None,
         dataset_nesting_depth: Optional[int] = None,
     ) -> set[tuple[Table, ...]]:
         snapshot_catalog = snapshot if snapshot else self
         dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
@@ -831,9 +831,7 @@ class Dataset:
         nested_paths = set()
         if dataset_rid:
             for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
-                nested_paths |= self._collect_paths(
-                    c, snapshot=snapshot_catalog
-                )
+                nested_paths |= self._collect_paths(c, snapshot=snapshot_catalog)
         else:
             # Initialize nesting depth if not already provided.
             dataset_nesting_depth = (
@@ -979,7 +977,7 @@ class Dataset:
             if dataset.materialize
             else self._download_dataset_minid(minid)
         )
-        return DatabaseModel(minid, bag_path).get_dataset()
+        return DatabaseModel(minid, bag_path, self._working_dir).get_dataset()
     def _version_snapshot(self, dataset: DatasetSpec) -> str:
         """Return a catalog with snapshot for the specified dataset version"""

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/demo_catalog.py RENAMED Viewed

@@ -295,7 +295,9 @@ def create_demo_catalog(
         dataset_table = deriva_ml.dataset_table
         dataset_table.annotations.update(
             Dataset(
-                deriva_ml.model, deriva_ml.cache_dir
+                deriva_ml.model,
+                cache_dir=deriva_ml.cache_dir,
+                working_dir=deriva_ml.working_dir,
             )._generate_dataset_annotations()
         )
         deriva_ml.model.apply()

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/deriva_ml_base.py RENAMED Viewed

@@ -163,7 +163,7 @@ class DerivaML(Dataset):
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         # Initialize dataset class.
-        super().__init__(self.model, self.cache_dir)
+        super().__init__(self.model, self.cache_dir, self.working_dir)
         self._logger = logging.getLogger("deriva_ml")
         self._logger.setLevel(logging_level)
@@ -257,7 +257,7 @@ class DerivaML(Dataset):
     def _get_notebook_path(self) -> Path | None:
         """Return the absolute path of the current notebook."""
-        server, session = self._get_notebook_session()
+        server, session = DerivaML._get_notebook_session()
         if server and session:
             self._check_nbstrip_status()
             relative_path = session["notebook"]["path"]
@@ -1136,8 +1136,8 @@ class DerivaML(Dataset):
             return None
     def create_workflow(
-        self, name: str, workflow_type: str, description: str = "", create: bool = True
-    ) -> RID | None:
+        self, name: str, workflow_type: str, description: str = ""
+    ) -> Workflow:
         """Identify current executing program and return a workflow RID for it
         Determine the notebook or script that is currently being executed. Assume that  this is
@@ -1149,10 +1149,21 @@ class DerivaML(Dataset):
             name: The name of the workflow.
             workflow_type: The type of the workflow.
             description: The description of the workflow.
-            create: Whether to create a new workflow.
         """
         # Make sure type is correct.
         self.lookup_term(MLVocab.workflow_type, workflow_type)
+        try:
+            subprocess.run(
+                "git rev-parse --is-inside-work-tree",
+                capture_output=True,
+                text=True,
+                shell=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            raise DerivaMLException("Not executing in a Git repository.")
         github_url, is_dirty = self._github_url()
         if is_dirty:
@@ -1174,14 +1185,13 @@ class DerivaML(Dataset):
             shell=True,
         ).stdout.strip()
-        workflow = Workflow(
+        return Workflow(
             name=name,
             url=github_url,
             checksum=checksum,
             description=description,
             workflow_type=workflow_type,
         )
-        return self.add_workflow(workflow) if create else None
     def _github_url(self) -> tuple[str, bool]:
         """Return a GitHUB URL for the latest commit of the script from which this routine is called.
@@ -1238,7 +1248,9 @@ class DerivaML(Dataset):
         return url, is_dirty
     # @validate_call
-    def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
+    def create_execution(
+        self, configuration: ExecutionConfiguration, dryrun: bool = False
+    ) -> "Execution":
         """Create an execution object
         Given an execution configuration, initialize the local compute environment to prepare for executing an
@@ -1249,6 +1261,7 @@ class DerivaML(Dataset):
         Args:
             configuration: ExecutionConfiguration:
+            dryrun: Do not create an execution record or upload results.
         Returns:
             An execution object.
@@ -1260,7 +1273,7 @@ class DerivaML(Dataset):
                 "Only one execution can be created for a Deriva ML instance."
             )
         else:
-            self._execution = Execution(configuration, self)
+            self._execution = Execution(configuration, self, dryrun=dryrun)
         return self._execution
     # @validate_call

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/execution.py RENAMED Viewed

@@ -30,7 +30,7 @@ from .deriva_definitions import (
 from .deriva_ml_base import DerivaML, FeatureRecord
 from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
 from .dataset_bag import DatasetBag
-from .execution_configuration import ExecutionConfiguration
+from .execution_configuration import ExecutionConfiguration, Workflow
 from .execution_environment import get_execution_environment
 from .upload import (
     execution_metadata_dir,
@@ -96,6 +96,7 @@ class Execution:
         configuration: ExecutionConfiguration,
         ml_object: "DerivaML",
         reload: Optional[RID] = None,
+        dry_run: bool = False,
     ):
         """
@@ -107,6 +108,7 @@ class Execution:
         self.asset_paths: list[Path] = []
         self.configuration = configuration
         self._ml_object = ml_object
+        self._logger = ml_object._logger
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
@@ -117,13 +119,23 @@ class Execution:
         self._working_dir = self._ml_object.working_dir
         self._cache_dir = self._ml_object.cache_dir
+        self._dry_run = dry_run
-        self.workflow_rid = self.configuration.workflow
-        if self._ml_object.resolve_rid(configuration.workflow).table.name != "Workflow":
-            raise DerivaMLException(
-                "Workflow specified in execution configuration is not a Workflow"
+        if isinstance(self.configuration.workflow, Workflow):
+            self.workflow_rid = (
+                self._ml_object.add_workflow(self.configuration.workflow)
+                if not self._dry_run
+                else "0000"
             )
+        else:
+            self.workflow_rid = self.configuration.workflow
+            if (
+                self._ml_object.resolve_rid(configuration.workflow).table.name
+                != "Workflow"
+            ):
+                raise DerivaMLException(
+                    "Workflow specified in execution configuration is not a Workflow"
+                )
         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
@@ -142,6 +154,10 @@ class Execution:
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
         if reload:
             self.execution_rid = reload
+            if self.execution_rid == "0000":
+                self._dry_run = True
+        elif self._dry_run:
+            self.execution_rid = "0000"
         else:
             self.execution_rid = schema_path.Execution.insert(
                 [
@@ -189,7 +205,7 @@ class Execution:
             self.dataset_rids.append(dataset.rid)
         # Update execution info
         schema_path = self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema]
-        if self.dataset_rids and not reload:
+        if self.dataset_rids and not (reload or self._dry_run):
             schema_path.Dataset_Execution.insert(
                 [
                     {"Dataset": d, "Execution": self.execution_rid}
@@ -203,7 +219,7 @@ class Execution:
             self._ml_object.download_asset(asset_rid=a, dest_dir=self._asset_dir())
             for a in self.configuration.assets
         ]
-        if self.asset_paths and not reload:
+        if self.asset_paths and not (reload or self._dry_run):
             self._update_execution_asset_table(self.configuration.assets)
         # Save configuration details for later upload
@@ -242,6 +258,11 @@ class Execution:
             msg: Additional information about the status
         """
         self.status = status
+        self._logger.info(msg)
+        if self._dry_run:
+            return
         self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
             [
                 {
@@ -278,7 +299,7 @@ class Execution:
         self.start_time = datetime.now()
         self.uploaded_assets = None
-        self.update_status(Status.initializing, "Start ML algorithm ...")
+        self.update_status(Status.initializing, "Start execution  ...")
     def execution_stop(self) -> None:
         """Finish the execution and update the duration and status of execution."""
@@ -288,13 +309,11 @@ class Execution:
         minutes, seconds = divmod(remainder, 60)
         duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"
-        if self._ml_object._is_notebook:
-            self._create_notebook_checkpoint()
         self.update_status(Status.completed, "Algorithm execution ended.")
-        self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
-            [{"RID": self.execution_rid, "Duration": duration}]
-        )
+        if not self._dry_run:
+            self._ml_object.pathBuilder.schemas[
+                self._ml_object.ml_schema
+            ].Execution.update([{"RID": self.execution_rid, "Duration": duration}])
     def _upload_execution_dirs(self) -> dict[str, FileUploadState]:
         """Upload execution assets at _working_dir/Execution_asset.
@@ -402,6 +421,8 @@ class Execution:
             Uploaded assets with key as assets' suborder name, values as an
             ordered dictionary with RID and metadata in the Execution_Asset table.
         """
+        if self._dry_run:
+            return {}
         try:
             uploaded_assets = self._upload_execution_dirs()
             self.update_status(Status.completed, "Successfully end the execution.")

{deriva_ml-1.9.0 → deriva_ml-1.10.0}/src/deriva_ml/execution_configuration.py RENAMED Viewed

@@ -49,7 +49,7 @@ class ExecutionConfiguration(BaseModel):
     datasets: conlist(DatasetSpec) = []
     assets: list[RID | str] = []  # List of RIDs to model files.
-    workflow: RID
+    workflow: RID | Workflow
     description: str = ""
     model_config = ConfigDict(arbitrary_types_allowed=True)

{deriva_ml-1.9.0 → deriva_ml-1.10.0/src/deriva_ml.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.9.0
+Version: 1.10.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -16,5 +16,14 @@ Requires-Dist: setuptools-scm<=6.0
 Requires-Dist: nbstripout
 Dynamic: license-file
-Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
+# DerivaML
+Deriva-ML is a python library to simplify the process of creating and executing reproducible machine learning workflows
 using a deriva catalog.
+## Installing the GitHub CLI
+The script release.sh will create a new release tag in GitHub.  This script requires the
+GitHUB CLI be installed.
+See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.

deriva_ml-1.9.0/README.md DELETED Viewed

@@ -1,2 +0,0 @@
-Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
-using a deriva catalog.