deriva-ml 1.8.2__tar.gz → 1.8.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {deriva_ml-1.8.2/src/deriva_ml.egg-info → deriva_ml-1.8.5}/PKG-INFO +6 -3
  2. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/pyproject.toml +4 -2
  3. deriva_ml-1.8.5/src/deriva_ml/VERSION.py +1 -0
  4. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/__init__.py +2 -1
  5. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset.py +149 -93
  6. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_definitions.py +0 -1
  7. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_ml_base.py +199 -33
  8. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution.py +45 -4
  9. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution_configuration.py +5 -5
  10. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/upload.py +4 -1
  11. {deriva_ml-1.8.2 → deriva_ml-1.8.5/src/deriva_ml.egg-info}/PKG-INFO +6 -3
  12. deriva_ml-1.8.5/src/deriva_ml.egg-info/requires.txt +7 -0
  13. deriva_ml-1.8.2/src/deriva_ml/VERSION.py +0 -1
  14. deriva_ml-1.8.2/src/deriva_ml.egg-info/requires.txt +0 -5
  15. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/LICENSE +0 -0
  16. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/README.md +0 -0
  17. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/setup.cfg +0 -0
  18. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/__init__.py +0 -0
  19. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/alter_annotation.py +0 -0
  20. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/annotation_temp.py +0 -0
  21. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/create_schema.py +0 -0
  22. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/build/lib/schema_setup/table_comments_utils.py +0 -0
  23. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/database_model.py +0 -0
  24. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset_aux_classes.py +0 -0
  25. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/dataset_bag.py +0 -0
  26. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/demo_catalog.py +0 -0
  27. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/deriva_model.py +0 -0
  28. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/execution_environment.py +0 -0
  29. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/feature.py +0 -0
  30. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/history.py +0 -0
  31. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/__init__.py +0 -0
  32. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/alter_annotation.py +0 -0
  33. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/annotations.py +0 -0
  34. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/create_schema.py +0 -0
  35. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/policy.json +0 -0
  36. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/schema_setup/table_comments_utils.py +0 -0
  37. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml/test_functions.py +0 -0
  38. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/SOURCES.txt +0 -0
  39. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/dependency_links.txt +0 -0
  40. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/entry_points.txt +0 -0
  41. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/src/deriva_ml.egg-info/top_level.txt +0 -0
  42. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_basic_tables.py +0 -0
  43. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_dataset.py +0 -0
  44. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_download.py +0 -0
  45. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_execution.py +0 -0
  46. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_features.py +0 -0
  47. {deriva_ml-1.8.2 → deriva_ml-1.8.5}/tests/test_upload.py +0 -0
@@ -1,16 +1,19 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.8.2
+ Version: 1.8.5
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: deriva~=1.7.6
+ Requires-Dist: deriva~=1.7.7
  Requires-Dist: pandas
  Requires-Dist: regex~=2024.7.24
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools-git-versioning<3,>=2.0
+ Requires-Dist: nbstripout
+ Dynamic: license-file

  Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
  using a deriva catalog.
@@ -13,11 +13,13 @@ description = "Utilities to simplify use of Dervia and Pandas to create reproduc
  readme = "README.md"
  requires-python = ">=3.10"
  dependencies = [
- "deriva~=1.7.6",
+ "deriva~=1.7.7",
  "pandas",
  "regex~=2024.7.24",
  "pydantic>=2.10.6",
- "semver>3.0.0"
+ "semver>3.0.0",
+ "setuptools-git-versioning>=2.0,<3",
+ "nbstripout",
  ]

  [tool.setuptools.package-data]
@@ -0,0 +1 @@
+ __version__ = "1.8.5"
@@ -4,6 +4,7 @@ __all__ = [
  "FileUploadState",
  "FileSpec",
  "ExecutionConfiguration",
+ "Execution",
  "Workflow",
  "DatasetBag",
  "DatasetVersion",
@@ -39,4 +40,4 @@ from .execution_configuration import (
  ExecutionConfiguration,
  Workflow,
  )
-
+ from .execution import Execution
@@ -6,6 +6,7 @@ accessible via a DerivaML class instance.

  """

+ from __future__ import annotations
  from bdbag.fetch.fetcher import fetch_single_file
  from bdbag import bdbag_api as bdb
  from collections import defaultdict
@@ -37,7 +38,7 @@ from pydantic import (
  import requests

  from tempfile import TemporaryDirectory, NamedTemporaryFile
- from typing import Any, Callable, Optional, Iterable, Iterator
+ from typing import Any, Callable, Optional, Iterable, Iterator, TYPE_CHECKING

  from deriva_ml import DatasetBag
  from .deriva_definitions import ML_SCHEMA, DerivaMLException, MLVocab, Status, RID
@@ -52,6 +53,9 @@ from .dataset_aux_classes import (
  DatasetSpec,
  )

+ if TYPE_CHECKING:
+ from .deriva_ml_base import DerivaML
+

  class Dataset:
  """
@@ -83,29 +87,32 @@ class Dataset:
  else:
  return not list(rid_info.datapath.entities().fetch())[0]["Deleted"]

- def _insert_dataset_version(
+ def _insert_dataset_versions(
  self,
- dataset_rid: RID,
- dataset_version: DatasetVersion,
+ dataset_list: list[DatasetSpec],
  description: Optional[str] = "",
  execution_rid: Optional[RID] = None,
  ) -> RID:
  schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
- version_path = schema_path.tables["Dataset_Version"]
- version_rid = version_path.insert(
- [
- {
- "Dataset": dataset_rid,
- "Version": str(dataset_version),
- "Description": description,
- "Execution": execution_rid,
- }
- ]
- )[0]["RID"]
- schema_path.tables["Dataset"].update(
- [{"RID": dataset_rid, "Version": version_rid}]
- )
- return version_rid
+
+ # Construct version records for insert
+ version_records = [
+ {
+ "Dataset": dataset.rid,
+ "Version": str(dataset.version),
+ "Description": description,
+ "Execution": execution_rid,
+ }
+ for dataset in dataset_list
+ ]
+
+ # Insert version records and construct entities for updating the dataset version column.
+ version_rids = [
+ {"Version": v["RID"], "RID": v["Dataset"]}
+ for v in schema_path.tables["Dataset_Version"].insert(version_records)
+ ]
+ schema_path.tables["Dataset"].update(version_rids)
+ return version_rids

  def _bootstrap_versions(self):
  datasets = [ds["RID"] for ds in self.find_datasets()]
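
The refactor above replaces the per-dataset _insert_dataset_version with a batched _insert_dataset_versions: all Dataset_Version rows go in with a single insert and the Dataset table is then pointed at the new version rows with a single update. A minimal sketch of the same datapath pattern, assuming `ml` is an already connected DerivaML instance; the RIDs and version strings below are hypothetical:

    # Batched version bump: one insert, one update (RIDs and versions are hypothetical).
    schema_path = ml.pathBuilder.schemas["deriva-ml"]
    version_records = [
        {"Dataset": "1-A1B2", "Version": "1.2.0", "Description": "bump", "Execution": None},
        {"Dataset": "1-C3D4", "Version": "2.0.1", "Description": "bump", "Execution": None},
    ]
    # Each inserted row comes back with its new RID.
    inserted = schema_path.tables["Dataset_Version"].insert(version_records)
    # Point each Dataset at its new version row in a single update call.
    schema_path.tables["Dataset"].update(
        [{"RID": row["Dataset"], "Version": row["RID"]} for row in inserted]
    )
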
@@ -237,16 +244,20 @@
  Raises:
  DerivaMLException: if provided RID is not to a dataset_table.
  """
- for dataset in self._build_dataset_graph(dataset_rid=dataset_rid):
- version = self.dataset_version(dataset)
- new_version = version.increment_version(component)
- self._insert_dataset_version(
- dataset,
- new_version,
- description=description,
- execution_rid=execution_rid,
+
+ # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
+ related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
+ version_update_list = [
+ DatasetSpec(
+ rid=ds_rid,
+ version=self.dataset_version(ds_rid).increment_version(component),
  )
- return self.dataset_version(dataset_rid)
+ for ds_rid in related_datasets
+ ]
+ updated_versions = self._insert_dataset_versions(
+ version_update_list, description=description, execution_rid=execution_rid
+ )
+ return [d.version for d in version_update_list if d.rid == dataset_rid][0]

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def create_dataset(
@@ -323,9 +334,8 @@
  pb.schemas[self._ml_schema].Dataset_Execution.insert(
  [{"Dataset": dataset_rid, "Execution": execution_rid}]
  )
- self._insert_dataset_version(
- dataset_rid,
- dataset_version=version,
+ self._insert_dataset_versions(
+ [DatasetSpec(rid=dataset_rid, version=version)],
  execution_rid=execution_rid,
  description="Initial dataset creation.",
  )
@@ -455,7 +465,7 @@
  dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
  dataset_rid: RID:
  recurse: (Default value = False)
- limit: If provided, the maxiumum number of members to return for each element type.
+ limit: If provided, the maximum number of members to return for each element type.

  Returns:
  Dictionary of entities associated with a specific dataset_table. Key is the table from which the elements
@@ -697,11 +707,25 @@
  list of RIDs of nested datasets.

  """
- children = [d["RID"] for d in self.list_dataset_members(dataset_rid)["Dataset"]]
- if recurse:
- for child in children.copy():
- children.extend(self.list_dataset_children(child, recurse=recurse))
- return children
+ dataset_dataset_path = (
+ self._model.catalog.getPathBuilder()
+ .schemas[self._ml_schema]
+ .tables["Dataset_Dataset"]
+ )
+ nested_datasets = list(dataset_dataset_path.entities().fetch())
+
+ def find_children(rid: RID):
+ children = [
+ child["Nested_Dataset"]
+ for child in nested_datasets
+ if child["Dataset"] == rid
+ ]
+ if recurse:
+ for child in children.copy():
+ children.extend(find_children(child))
+ return children
+
+ return find_children(dataset_rid)

  def _vocabulary_specification(
  self, writer: Callable[[str, str, Table], list[dict[str, Any]]]
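
The rewritten list_dataset_children above fetches the Dataset_Dataset association table once and then walks the nesting entirely in memory, instead of issuing a catalog query per nesting level. A self-contained sketch of that traversal, using hypothetical rows shaped like the association table ({"Dataset": parent RID, "Nested_Dataset": child RID}):

    # Hypothetical Dataset_Dataset rows: parent dataset -> nested dataset.
    nested_datasets = [
        {"Dataset": "1-P0", "Nested_Dataset": "1-C1"},
        {"Dataset": "1-C1", "Nested_Dataset": "1-C2"},
    ]

    def find_children(rid, recurse=True):
        # One pass over the pre-fetched rows per level; no further catalog queries.
        children = [row["Nested_Dataset"] for row in nested_datasets if row["Dataset"] == rid]
        if recurse:
            for child in list(children):
                children.extend(find_children(child))
        return children

    print(find_children("1-P0"))  # ['1-C1', '1-C2']
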
@@ -727,20 +751,19 @@
  ]

  def _table_paths(
- self, dataset: DatasetSpec = None
- ) -> Iterator[tuple[list[str], list[str], list[Table]]]:
+ self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
+ ) -> Iterator[tuple[str, str, Table]]:

- dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-
- paths = self._collect_paths(dataset and dataset.rid)
+ paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)

  def source_path(path: tuple[Table, ...]):
+ """Convert a tuple representing a path into a source path component with FK linkage"""
  path = list(path)
  p = [f"{self._model.ml_schema}:Dataset/RID={{Dataset_RID}}"]
  for table in path[1:]:
- if table == dataset_dataset:
+ if table.name == "Dataset_Dataset":
  p.append("(RID)=(deriva-ml:Dataset_Dataset:Dataset)")
- elif table == self.dataset_table:
+ elif table.name == "Dataset":
  p.append("(Nested_Dataset)=(deriva-ml:Dataset:RID)")
  elif table.name == "Dataset_Version":
  p.append(f"(RID)=({self._model.ml_schema}:Dataset_Version:Dataset)")
@@ -751,63 +774,76 @@
  src_paths = ["/".join(source_path(p)) for p in paths]
  dest_paths = ["/".join([t.name for t in p]) for p in paths]
  target_tables = [p[-1] for p in paths]
-
  return zip(src_paths, dest_paths, target_tables)

  def _collect_paths(
  self,
  dataset_rid: Optional[RID] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  dataset_nesting_depth: Optional[int] = None,
  ) -> set[tuple[Table, ...]]:

- dataset_nesting_depth = (
- self._dataset_nesting_depth()
- if dataset_nesting_depth is None
- else dataset_nesting_depth
- )
- dataset_dataset = self._model.schemas[self._ml_schema].tables["Dataset_Dataset"]
-
- # Figure out which paths we don't need to query for this dataset. If no dataset is provided, use them all.
- dataset_elements = (
- [
- self._model.name_to_table(e)
- for e, m in self.list_dataset_members(
+ snapshot_catalog = snapshot_catalog or self
+ dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
+ "Dataset"
+ ]
+ dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
+ "Dataset_Dataset"
+ ]
+ dataset_associations = [
+ a
+ for a in self.dataset_table.find_associations()
+ if a.table.schema.name != self._ml_schema
+ or a.table.name == "Dataset_Dataset"
+ ]
+ if dataset_rid:
+ # Get a list of the members of the dataset so we can figure out which tables to query.
+ dataset_elements = [
+ snapshot_catalog._model.name_to_table(e)
+ for e, m in snapshot_catalog.list_dataset_members(
  dataset_rid=dataset_rid, limit=1
  ).items()
  if m
  ]
- if dataset_rid
- else self.list_dataset_element_types()
- )
-
- dataset_associations = [a.table for a in self.dataset_table.find_associations()]
- included_associations = [
- a.table
- for a in self.dataset_table.find_associations()
- if a.other_fkeys.pop().pk_table in dataset_elements
- ]
+ included_associations = [
+ a.table
+ for a in dataset_table.find_associations()
+ if a.other_fkeys.pop().pk_table in dataset_elements
+ ]
+ else:
+ included_associations = dataset_associations
  # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
  paths = {
  tuple(p)
- for p in self._model._schema_to_paths()
+ for p in snapshot_catalog._model._schema_to_paths()
  if (len(p) == 1)
- or (p[1] not in dataset_associations)
- or (p[1] in included_associations)
+ or (p[1] not in dataset_associations) # Tables in the domain schema
+ or (
+ p[1] in included_associations
+ ) # Tables that include members of the dataset
  }
  # Now get paths for nested datasets
  nested_paths = set()
  if dataset_rid:
- for c in self.list_dataset_children(dataset_rid=dataset_rid):
- nested_paths |= self._collect_paths(c)
+ for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
+ nested_paths |= self._collect_paths(
+ c, snapshot_catalog=snapshot_catalog
+ )
  else:
+ # Initialize nesting depth if not already provided.
+ dataset_nesting_depth = (
+ self._dataset_nesting_depth()
+ if dataset_nesting_depth is None
+ else dataset_nesting_depth
+ )
  if dataset_nesting_depth:
  nested_paths = self._collect_paths(
  dataset_nesting_depth=dataset_nesting_depth - 1
  )
  if nested_paths:
  paths |= {
- tuple([self.dataset_table]),
- (self.dataset_table, dataset_dataset),
+ tuple([dataset_table]),
+ (dataset_table, dataset_dataset),
  }
  paths |= {(self.dataset_table, dataset_dataset) + p for p in nested_paths}
  return paths
@@ -863,6 +899,7 @@
  self,
  writer: Callable[[str, str, Table], list[dict[str, Any]]],
  dataset: DatasetSpec,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Output a download/export specification for a dataset_table. Each element of the dataset_table will be placed in its own dir
  The top level data directory of the resulting BDBag will have one subdirectory for element type. the subdirectory
@@ -902,21 +939,24 @@
  A dataset_table specification.
  """
  element_spec = []
- for path in self._table_paths(dataset=dataset):
+ for path in self._table_paths(
+ dataset=dataset, snapshot_catalog=snapshot_catalog
+ ):
  element_spec.extend(writer(*path))
  return self._vocabulary_specification(writer) + element_spec

- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
- def download_dataset_bag(
+ def _download_dataset_bag(
  self,
  dataset: DatasetSpec,
  execution_rid: Optional[RID] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> DatasetBag:
  """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.

  Args:
  dataset: Specification of the dataset to be downloaded.
  execution_rid: Execution RID for the dataset.
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.

  Returns:
  Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
@@ -927,16 +967,17 @@
  and self._model.catalog.resolve_rid(execution_rid).table.name != "Execution"
  ):
  raise DerivaMLException(f"RID {execution_rid} is not an execution")
- minid = self.get_dataset_minid(dataset)
+ minid = self._get_dataset_minid(dataset, snapshot_catalog=snapshot_catalog)

  bag_path = (
  self._materialize_dataset_bag(minid, execution_rid=execution_rid)
  if dataset.materialize
- else self._download_dataset_bag(minid)
+ else self._download_dataset_minid(minid)
  )
  return DatabaseModel(minid, bag_path).get_dataset()

  def _version_snapshot(self, dataset: DatasetSpec) -> str:
+ """Return a catalog with snapshot for the specified dataset version"""
  version_record = [
  h
  for h in self.dataset_history(dataset_rid=dataset.rid)
@@ -944,13 +985,17 @@
  ][0]
  return f"{self._model.catalog.catalog_id}@{iso_to_snap(version_record.timestamp.isoformat())}"

- def _create_dataset_minid(self, dataset: DatasetSpec) -> str:
+ def _create_dataset_minid(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+ ) -> str:
  with TemporaryDirectory() as tmp_dir:
  # Generate a download specification file for the current catalog schema. By default, this spec
  # will generate a minid and place the bag into S3 storage.
  spec_file = f"{tmp_dir}/download_spec.json"
  with open(spec_file, "w", encoding="utf-8") as ds:
- json.dump(self._generate_dataset_download_spec(dataset), ds)
+ json.dump(
+ self._generate_dataset_download_spec(dataset, snapshot_catalog), ds
+ )
  try:
  self._logger.info(
  f"Downloading dataset minid for catalog: {dataset.rid}@{str(dataset.version)}"
@@ -987,14 +1032,17 @@
  version_path.update([{"RID": version_rid, "Minid": minid_page_url}])
  return minid_page_url

- @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
- def get_dataset_minid(
- self, dataset: DatasetSpec, create: bool = True
+ def _get_dataset_minid(
+ self,
+ dataset: DatasetSpec,
+ snapshot_catalog: Optional[DerivaML] = None,
+ create: bool = True,
  ) -> DatasetMinid:
  """Return a MINID to the specified dataset. If no version is specified, use the latest.

  Args:
  dataset: Specification of the dataset.
+ snapshot_catalog: Snapshot catalog for the dataset version if specified.
  create: Create a new MINID if one doesn't already exist.

  Returns:
@@ -1025,12 +1073,12 @@
  f"Minid for dataset {dataset.rid} doesn't exist"
  )
  self._logger.info("Creating new MINID for dataset %s", dataset.rid)
- minid_url = self._create_dataset_minid(dataset)
+ minid_url = self._create_dataset_minid(dataset, snapshot_catalog)
  # If provided a MINID, use the MINID metadata to get the checksum and download the bag.
  r = requests.get(minid_url, headers={"accept": "application/json"})
  return DatasetMinid(dataset_version=dataset.version, **r.json())

- def _download_dataset_bag(self, minid: DatasetMinid) -> Path:
+ def _download_dataset_minid(self, minid: DatasetMinid) -> Path:
  """Given a RID to a dataset_table, or a MINID to an existing bag, download the bag file, extract it and validate
  that all the metadata is correct

@@ -1097,7 +1145,7 @@
  return True

  # request metadata
- bag_path = self._download_dataset_bag(minid)
+ bag_path = self._download_dataset_minid(minid)
  bag_dir = bag_path.parent
  validated_check = bag_dir / "validated_check.txt"

@@ -1112,7 +1160,9 @@
  return Path(bag_path)

  def _export_outputs(
- self, dataset: Optional[DatasetSpec] = None
+ self,
+ dataset: Optional[DatasetSpec] = None,
+ snapshot_catalog: Optional[DerivaML] = None,
  ) -> list[dict[str, Any]]:
  """Return and output specification for the datasets in the provided model

@@ -1150,9 +1200,13 @@
  "source": {"api": "schema", "skip_root_path": True},
  "destination": {"type": "json", "name": "schema"},
  },
- ] + self._dataset_specification(writer, dataset)
+ ] + self._dataset_specification(
+ writer, dataset, snapshot_catalog=snapshot_catalog
+ )

- def _processor_params(self, dataset: DatasetSpec) -> list[dict[str, Any]]:
+ def _processor_params(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML] = None
+ ) -> list[dict[str, Any]]:
  """
  Returns:
  a download specification for the datasets in the provided model.
@@ -1178,7 +1232,7 @@
  "processor": "json",
  "processor_params": {"query_path": "/schema", "output_path": "schema"},
  }
- ] + self._dataset_specification(writer, dataset)
+ ] + self._dataset_specification(writer, dataset, snapshot_catalog)

  @staticmethod
  def _download_dataset_element(
@@ -1257,7 +1311,9 @@
  )
  return exports

- def _generate_dataset_download_spec(self, dataset: DatasetSpec) -> dict[str, Any]:
+ def _generate_dataset_download_spec(
+ self, dataset: DatasetSpec, snapshot_catalog: Optional[DerivaML]
+ ) -> dict[str, Any]:
  """

  Returns:
@@ -1315,7 +1371,7 @@
  },
  },
  ]
- + self._processor_params(dataset),
+ + self._processor_params(dataset, snapshot_catalog),
  },
  }

@@ -139,7 +139,6 @@ class FileSpec(BaseModel):
  if url_parts.scheme == "tag":
  return v
  elif not url_parts.scheme:
- print(v)
  return f'tag://{gethostname()},{date.today()}:file://{v}'
  else:
  raise ValidationError("url is not a file URL")
@@ -15,8 +15,12 @@ import logging
  from datetime import datetime
  import hashlib
  from itertools import chain
+ import inspect
  from pathlib import Path
  import requests
+ from setuptools_git_versioning import get_latest_file_commit
+ import subprocess
+ import shutil
  from typing import Optional, Any, Iterable, TYPE_CHECKING
  from deriva.core import (
  ErmrestCatalog,
@@ -27,6 +31,7 @@ from deriva.core import (
  )
  import deriva.core.datapath as datapath
  from deriva.core.datapath import DataPathException
+ from deriva.core.deriva_server import DerivaServer
  from deriva.core.ermrest_catalog import ResolveRidResult
  from deriva.core.ermrest_model import Key, Table
  from deriva.core.hatrac_store import HatracStore
@@ -35,6 +40,8 @@ from pydantic import validate_call, ConfigDict
  from .execution_configuration import ExecutionConfiguration, Workflow
  from .feature import Feature, FeatureRecord
  from .dataset import Dataset
+ from .dataset_aux_classes import DatasetSpec
+ from .dataset_bag import DatasetBag
  from .deriva_model import DerivaModel
  from .upload import (
  table_path,
@@ -56,6 +63,18 @@ from .deriva_definitions import (
  FileSpec,
  )

+ try:
+ from icecream import ic
+ except ImportError: # Graceful fallback if IceCream isn't installed.
+ ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa
+
+
+ try:
+ from IPython import get_ipython
+ except ImportError: # Graceful fallback if IPython isn't installed.
+ get_ipython = lambda: None
+
+
  if TYPE_CHECKING:
  from .execution import Execution

@@ -98,13 +117,13 @@ class DerivaML(Dataset):
  model_version: A string that indicates the version model. Typically passed in via
  """
  self.credential = get_credential(hostname)
- self.catalog = ErmrestCatalog(
+ server = DerivaServer(
  "https",
  hostname,
- catalog_id,
- self.credential,
+ credentials=self.credential,
  session_config=self._get_session_config(),
  )
+ self.catalog = server.connect_ermrest(catalog_id)
  self.model = DerivaModel(
  self.catalog.getCatalogModel(), domain_schema=domain_schema
  )
@@ -132,6 +151,29 @@ class DerivaML(Dataset):
  self.version = model_version
  self.configuration = None
  self._execution: Optional[Execution] = None
+ self._notebook = None
+ try:
+ from IPython import get_ipython
+
+ ipython = get_ipython()
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+ if ipython is not None and "IPKernelApp" in ipython.config:
+ self._notebook = Path(ipython.user_ns.get("__session__"))
+ # Check if running in Jupyter's ZMQ kernel (used by notebooks)
+ try:
+ if subprocess.run(
+ [shutil.which("nbstripout"), "--is-installed"],
+ check=False,
+ capture_output=True,
+ ).returncode:
+ self._logger.warn(
+ "nbstripout is not installed in repository. Please run nbstripout --install"
+ )
+ except subprocess.CalledProcessError:
+ self._logger.error("nbstripout is not found.")
+
+ except (ImportError, AttributeError):
+ pass

  self.domain_schema = self.model.domain_schema
  self.project_name = project_name or self.domain_schema
@@ -705,6 +747,28 @@ class DerivaML(Dataset):
  for v in pb.schemas[table.schema.name].tables[table.name].entities().fetch()
  ]

+ @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
+ def download_dataset_bag(
+ self,
+ dataset: DatasetSpec,
+ execution_rid: Optional[RID] = None,
+ ) -> DatasetBag:
+ """Download a dataset onto the local file system. Create a MINID for the dataset if one doesn't already exist.
+
+ Args:
+ dataset: Specification of the dataset to be downloaded.
+ execution_rid: Execution RID for the dataset.
+
+ Returns:
+ Tuple consisting of the path to the dataset, the RID of the dataset that was downloaded and the MINID
+ for the dataset.
+ """
+ return self._download_dataset_bag(
+ dataset=dataset,
+ execution_rid=execution_rid,
+ snapshot_catalog=DerivaML(self.host_name, self._version_snapshot(dataset)),
+ )
+

  @validate_call(config=ConfigDict(arbitrary_types_allowed=True))
  def download_asset(self, asset_rid: RID, dest_dir: Path) -> Path:
@@ -808,8 +872,10 @@
  Iterable of the RIDs of the files that were added.
  """
  defined_types = self.list_vocabulary_terms(MLVocab.file_type)
- if execution_rid and self.resolve_rid(execution_rid).table.name != 'Execution':
- raise DerivaMLException(f'RID {execution_rid} is not for an execution table.')
+ if execution_rid and self.resolve_rid(execution_rid).table.name != "Execution":
+ raise DerivaMLException(
+ f"RID {execution_rid} is not for an execution table."
+ )

  def check_file_type(dtype: str) -> bool:
  for term in defined_types:
@@ -862,18 +928,11 @@
  self, file_types: Optional[list[str]] = None
  ) -> list[dict[str, Any]]:
  """Return the contents of the file table. Denormalized file types into the file record."""
- atable = next(
- self._model.schemas[self._ml_schema]
- .tables[MLVocab.dataset_type]
- .find_associations()
- ).name
  ml_path = self.pathBuilder.schemas[self._ml_schema]
- atable_path = ml_path.tables[atable]
  file_path = ml_path.File
  type_path = ml_path.File_File_Type

  # Get a list of all the dataset_type values associated with this dataset_table.
- files = []
  path = file_path.link(type_path)
  path = path.attributes(
  path.File.RID,
@@ -885,10 +944,12 @@
  )
  file_map = {}
  for f in path.fetch():
- file_map.setdefault(f['RID'], f | {'File_Types': []})['File_Types'].append(f['File_Type'])
+ file_map.setdefault(f["RID"], f | {"File_Types": []})["File_Types"].append(
+ f["File_Type"]
+ )

  # Now get rid of the File_Type key and return the result
- return [ (f, f.pop('File_Type'))[0] for f in file_map.values()]
+ return [(f, f.pop("File_Type"))[0] for f in file_map.values()]

  def list_workflows(self) -> list[Workflow]:
  """Return a list of all the workflows in the catalog."""
@@ -901,6 +962,7 @@
  version=w["Version"],
  description=w["Description"],
  rid=w["RID"],
+ checksum=w["Checksum"],
  )
  for w in workflow_path.entities().fetch()
  ]
@@ -917,33 +979,18 @@
  """

  # Check to make sure that the workflow is not already in the table. If it's not, add it.
- def get_checksum(url) -> str:
- """Get the checksum of a file from a URL."""
- try:
- response = requests.get(url)
- response.raise_for_status()
- except Exception:
- raise DerivaMLException(f"Invalid URL: {url}")
- else:
- sha256_hash = hashlib.sha256()
- sha256_hash.update(response.content)
- checksum = "SHA-256: " + sha256_hash.hexdigest()
- return checksum
+
+ if workflow_rid := self.lookup_workflow(workflow.url):
+ return workflow_rid

  ml_schema_path = self.pathBuilder.schemas[self.ml_schema]
  try:
- url_column = ml_schema_path.Workflow.URL
- workflow_record = list(
- ml_schema_path.Workflow.filter(url_column == workflow.url).entities()
- )[0]
- workflow_rid = workflow_record["RID"]
- except IndexError:
  # Record doesn't exist already
  workflow_record = {
  "URL": workflow.url,
  "Name": workflow.name,
  "Description": workflow.description,
- "Checksum": get_checksum(workflow.url),
+ "Checksum": workflow.checksum,
  "Version": workflow.version,
  MLVocab.workflow_type: self.lookup_term(
  MLVocab.workflow_type, workflow.workflow_type
@@ -955,6 +1002,125 @@
  raise DerivaMLException(f"Failed to insert workflow. Error: {error}")
  return workflow_rid

+ def lookup_workflow(self, url: str) -> Optional[RID]:
+ workflow_path = self.pathBuilder.schemas[self.ml_schema].Workflow
+ try:
+ url_column = workflow_path.URL
+ return list(workflow_path.filter(url_column == url).entities())[0]["RID"]
+ except IndexError:
+ return None
+
+ def create_workflow(
+ self, name: str, workflow_type: str, description: str = "", create: bool = True
+ ) -> RID:
+ """Identify current executing program and return a workflow RID for it
+
+ Determine the notebook or script that is currently being executed. Assume that this is
+ being executed from a cloned GitHub repository. Determine the remote repository name for
+ this object. Then either retrieve an existing workflow for this executable or create
+ a new one.
+
+ Args:
+ name: The name of the workflow.
+ workflow_type: The type of the workflow.
+ description: The description of the workflow.
+ create: Whether to create a new workflow.
+ """
+ # Make sure type is correct.
+ self.lookup_term(MLVocab.workflow_type, workflow_type)
+ filename, github_url, is_dirty = self._github_url()
+
+ if is_dirty:
+ self._logger.warning(
+ f"File {filename} has been modified since last commit. Consider commiting before executing"
+ )
+
+ sha256_hash = hashlib.sha256()
+ if self._notebook:
+ # If you are in a notebook, strip out the outputs before computing the checksum.
+ result = subprocess.run(
+ ["nbstripout", "-t", filename],
+ capture_output=True,
+ text=False,
+ check=True,
+ )
+ sha256_hash.update(result.stdout)
+ else:
+ with open(filename, "rb") as f:
+ sha256_hash.update(f.read())
+ checksum = "SHA-256:" + sha256_hash.hexdigest()
+
+ workflow = Workflow(
+ name=name,
+ url=github_url,
+ checksum=checksum,
+ description=description,
+ workflow_type=workflow_type,
+ )
+ return self.add_workflow(workflow) if create else None
+
+ def _github_url(self) -> tuple[Path, str, bool]:
+ """Return a GitHUB URL for the latest commit of the script from which this routine is called.
+
+ This routine is used to be called from a script or notebook (e.g. python -m file). It assumes that
+ the file is in a gitHUB repository and commited. It returns a URL to the last commited version of this
+ file in GitHUB.
+
+ Returns: A tuple with the filename, gethub_url and a boolean to indicated if uncommited changes
+ have been made to the file.
+
+ """
+
+ # Get the name of the script that is calling this function.
+ if self._notebook:
+ # Try to get the __session__ variable from the user namespace.
+ filename = Path("").absolute().parent / self._notebook
+ else:
+ stack = inspect.stack()
+ if len(stack) > 1:
+ filename = Path(
+ stack[2].filename
+ ) # Get the caller's filename, which is two up the stack from here.
+ else:
+ raise DerivaMLException(
+ f"Looking for caller failed"
+ ) # Stack is too shallow
+
+ # Get repo URL from local github repo.
+ try:
+ result = subprocess.run(
+ ["git", "remote", "get-url", "origin"], capture_output=True, text=True
+ )
+ github_url = result.stdout.strip().removesuffix(".git")
+ except subprocess.CalledProcessError:
+ raise DerivaMLException(f"No GIT remote found")
+
+ # Find the root directory for the repository
+ repo_root = filename
+ while repo_root != repo_root.root:
+ if (repo_root / ".git").exists():
+ break
+ else:
+ repo_root = repo_root.parent
+
+ # Now check to see if file has been modified since the last commit.
+ try:
+ result = subprocess.run(
+ ["git", "status", "--porcelain"],
+ capture_output=True,
+ text=True,
+ check=True,
+ )
+ is_dirty = bool(
+ "M " in result.stdout.strip()
+ ) # Returns True if output indicates a modified file
+ except subprocess.CalledProcessError:
+ is_dirty = False # If Git command fails, assume no changes
+
+ sha = get_latest_file_commit(filename)
+ url = f"{github_url}/blob/{sha}/{filename.relative_to(repo_root)}"
+ return filename, url, is_dirty
+
  # @validate_call
  def create_execution(self, configuration: ExecutionConfiguration) -> "Execution":
  """Create an execution object
@@ -12,6 +12,7 @@ import os
  import shutil
  from datetime import datetime
  from pathlib import Path
+ import requests
  from tempfile import NamedTemporaryFile
  from typing import Iterable, Any, Optional
  from deriva.core import format_exception
@@ -28,7 +29,6 @@ from .deriva_definitions import (
  )
  from .deriva_ml_base import DerivaML, FeatureRecord
  from .dataset_aux_classes import DatasetSpec, DatasetVersion, VersionPart
- from .dataset import Dataset
  from .dataset_bag import DatasetBag
  from .execution_configuration import ExecutionConfiguration
  from .execution_environment import get_execution_environment
@@ -51,6 +51,12 @@ except ImportError: # Graceful fallback if IceCream isn't installed.
  ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a) # noqa


+ try:
+ from jupyter_server.serverapp import list_running_servers
+ except ImportError:
+ list_running_servers = lambda: []
+
+
  class Execution:
  """The Execution class is used to capture the context of an activity within DerivaML. While these are primarily
  computational, manual processes can be represented by an execution as well.
@@ -100,6 +106,7 @@
  self.configuration = configuration
  self._ml_object = ml_object
  self.start_time = None
+ self.stop_time = None
  self.status = Status.created
  self.uploaded_assets: list[Path] = []

@@ -221,8 +228,9 @@
  Returns:
  the location of the unpacked and validated dataset_table bag and the RID of the bag
  """
- ds = Dataset(self._ml_object.model, cache_dir=self._cache_dir)
- return ds.download_dataset_bag(dataset, execution_rid=self.execution_rid)
+ return self._ml_object.download_dataset_bag(
+ dataset, execution_rid=self.execution_rid
+ )

  @validate_call
  def update_status(self, status: Status, msg: str) -> None:
@@ -243,6 +251,35 @@
  ]
  )

+ def _create_notebook_checkpoint(self):
+ """Trigger a checkpoint creation using Jupyter's API."""
+ notebook_name = self._ml_object._notebook
+
+ # Look for the server running this notebook.
+ root = Path("").absolute().parent.as_posix()
+ servers = list(list_running_servers())
+ # Jupyterhub seems to handle root_dir differently then server case.
+ server = (
+ servers
+ if len(servers) == 1
+ else [s for s in servers if s["root_dir"] == root]
+ )[0]
+ notebook_url = f"{server['url']}api/contents/{notebook_name}"
+
+ # Get notebook content
+ response = requests.get(
+ notebook_url, headers={"Authorization": f"Token {server['token']}"}
+ )
+ if response.status_code == 200:
+ notebook_content = response.json()["content"]
+ # Execution metadata cannot be in a directory, so map path into filename.
+ checkpoint_path = (
+ self.execution_metadata_path(ExecMetadataVocab.runtime_env.value)
+ / f"{notebook_name.as_posix().replace('/','_')}.checkpoint"
+ )
+ with open(checkpoint_path, "w", encoding="utf-8") as f:
+ json.dump(notebook_content, f)
+
  def execution_start(self) -> None:
  """Start an execution, uploading status to catalog"""

@@ -252,11 +289,15 @@

  def execution_stop(self) -> None:
  """Finish the execution and update the duration and status of execution."""
- duration = datetime.now() - self.start_time
+ self.stop_time = datetime.now()
+ duration = self.stop_time - self.start_time
  hours, remainder = divmod(duration.total_seconds(), 3600)
  minutes, seconds = divmod(remainder, 60)
  duration = f"{round(hours, 0)}H {round(minutes, 0)}min {round(seconds, 4)}sec"

+ if self._ml_object._notebook:
+ self._create_notebook_checkpoint()
+
  self.update_status(Status.completed, "Algorithm execution ended.")
  self._ml_object.pathBuilder.schemas[self._ml_object.ml_schema].Execution.update(
  [{"RID": self.execution_rid, "Duration": duration}]
@@ -33,18 +33,18 @@ class Workflow(BaseModel):
  version: Optional[str] = None
  description: Optional[str] = ""
  rid: Optional[RID] = None
+ checksum: Optional[str]
+


  class ExecutionConfiguration(BaseModel):
  """Define the parameters that are used to configure a specific execution.

  Attributes:
- datasets: List of dataset_table RIDS, MINIDS for datasets to be downloaded prior to execution. By default,
- all the datasets are materialized. However, if the assets associated with a dataset_table are not
- needed, a dictionary that defines the rid and the materialization parameter for the
- download_dataset_bag method can be specified, e.g. datasets=[{'rid': RID, 'materialize': True}].
+ datasets: List of dataset specifications which specify the dataset RID, version and if the dataset
+ should be materialized.
  assets: List of assets to be downloaded prior to execution. The values must be RIDs in an asset table
- workflow: A workflow instance. Must have a name, URI to the workflow instance, and a type.
+ workflow: A RID for a workflow instance. Must have a name, URI to the workflow instance, and a type.
  description: A description of the execution. Can use Markdown format.
  """
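
The updated docstring above says a configuration now takes a workflow RID and a list of dataset specifications. Combined with the create_workflow and create_execution methods shown earlier, a hedged end-to-end sketch looks like the following; it assumes the code runs from a committed clone of a GitHub repository (as _github_url requires), that a "python-script" term exists in the Workflow_Type vocabulary, and that the dataset RID and version are hypothetical:

    from deriva_ml import ExecutionConfiguration
    from deriva_ml.dataset_aux_classes import DatasetSpec

    # `ml` is a connected DerivaML instance (see the earlier sketch).
    workflow_rid = ml.create_workflow(
        name="train-classifier",        # hypothetical workflow name
        workflow_type="python-script",  # must be an existing Workflow_Type term
        description="Nightly training run",
    )
    config = ExecutionConfiguration(
        workflow=workflow_rid,
        datasets=[DatasetSpec(rid="1-ABCD", version="1.2.0")],  # hypothetical dataset
        description="Training execution",
    )
    execution = ml.create_execution(config)
    execution.execution_start()
    # ... run the model ...
    execution.execution_stop()
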
 
@@ -70,8 +70,11 @@
  exec_metadata_dir_regex = (
  exec_dir_regex + r"/execution-metadata/(?P<execution_metadata_type>[-\w]+)"
  )
+
+ # May have more than one suffix
  exec_metadata_regex = (
- exec_metadata_dir_regex + r"/(?P<filename>[-\w]+)[.](?P<file_ext>[a-z0-9]*)$"
+ exec_metadata_dir_regex
+ + r"/(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
  )
  feature_dir_regex = exec_dir_regex + r"/feature"
  feature_table_dir_regex = (
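
The relaxed exec_metadata_regex above lets execution-metadata filenames carry more than one suffix, such as the notebook .checkpoint files written by Execution. A quick standalone check of just the new filename portion of the pattern; the directory-prefix regexes are unchanged and omitted here, and the sample filename is illustrative:

    import re

    # Dotted middle suffixes now land in the "filename" group; only the last one is the extension.
    # The pre-1.8.5 pattern rejected this name because its filename group could not contain dots.
    filename_part = r"(?P<filename>[-\w]+([.][\w]+)*)[.](?P<file_ext>[a-z0-9]*)$"
    m = re.search(filename_part, "my_notebook.ipynb.checkpoint")
    print(m.group("filename"), m.group("file_ext"))  # my_notebook.ipynb checkpoint
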
@@ -1,16 +1,19 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: deriva-ml
- Version: 1.8.2
+ Version: 1.8.5
  Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
  Author-email: ISRD <isrd-dev@isi.edu>
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: deriva~=1.7.6
+ Requires-Dist: deriva~=1.7.7
  Requires-Dist: pandas
  Requires-Dist: regex~=2024.7.24
  Requires-Dist: pydantic>=2.10.6
  Requires-Dist: semver>3.0.0
+ Requires-Dist: setuptools-git-versioning<3,>=2.0
+ Requires-Dist: nbstripout
+ Dynamic: license-file

  Deriva-ML is a python libary to simplify the process of creating and executing reproducible machine learning workflows
  using a deriva catalog.
@@ -0,0 +1,7 @@
+ deriva~=1.7.7
+ pandas
+ regex~=2024.7.24
+ pydantic>=2.10.6
+ semver>3.0.0
+ setuptools-git-versioning<3,>=2.0
+ nbstripout
@@ -1 +0,0 @@
- __version__ = "1.8.2"
@@ -1,5 +0,0 @@
- deriva~=1.7.6
- pandas
- regex~=2024.7.24
- pydantic>=2.10.6
- semver>3.0.0