PyPI - deriva-ml - Versions diffs - 1.11.0__tar.gz → 1.12.0__tar.gz - Mend

deriva-ml 1.11.0__tar.gz → 1.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

{deriva_ml-1.11.0/src/deriva_ml.egg-info → deriva_ml-1.12.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.11.0
+Version: 1.12.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -28,3 +28,4 @@ The script release.sh will create a new release tag in GitHub.  This script requ
 GitHUB CLI be installed.
 See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/README.md RENAMED Viewed

@@ -9,3 +9,4 @@ The script release.sh will create a new release tag in GitHub.  This script requ
 GitHUB CLI be installed.
 See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/Notebooks/DerivaML Execution.ipynb RENAMED Viewed

@@ -721,12 +721,12 @@
     "    pass\n",
     "\n",
     "    # Write a new model\n",
-    "    model_file = manual_execution.asset_path('API_Model') / 'modelfile.txt'\n",
+    "    model_file = manual_execution.asset_path('API_Model', 'modelfile.txt')\n",
     "    with open(model_file, 'w') as f:\n",
     "        f.write(\"Hello there a new model;\\n\")\n",
     "\n",
     "    # Create some new feature values.\n",
-    "    bb_csv_path, bb_asset_paths = ml_execution.feature_paths('Image', 'BoundingBox')\n",
+    "    bb_csv_path, bb_asset_paths = ml_execution.asset_path('Image', 'BoundingBox')\n",
     "    bounding_box_files = [bb_asset_paths['BoundingBox'] / f\"box{i}.txt\" for i in range(10)]\n",
     "    for i in range(10):\n",
     "        bounding_box_files.append(fn := bb_asset_paths['BoundingBox'] / f\"box{i}.txt\")\n",

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/docs/user-guide/datasets.md RENAMED Viewed

@@ -17,7 +17,6 @@ Dataset types are assigned from a controlled vocabulary called `MLVocab.dataset_
 as you need:
 ```
 from deriva_ml import MLVocab
-...
 ml_instance.add_term(MLVocab.dataset_type, "DemoSet", description="A test dataset_table")
 ```
 When you create a dataset, you can provide as many dataset types as required to streamline orginizing and discovering
@@ -30,7 +29,6 @@ Its important to know how a dataset was created, so the most common way to creat
 # Now lets create model configuration for our program.
 api_workflow = Workflow(
     name="API Workflow",
-    url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Dataset.ipynb",
     workflow_type="Create Dataset Notebook"
 )

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/database_model.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Ths module contains the definition of the DatabaseModel class.  The role of this class is to provide an nterface between the BDBag representation
+"""This module contains the definition of the DatabaseModel class.  The role of this class is to provide an interface between the BDBag representation
 of a dataset and a sqllite database in which the contents of the bag are stored.
 """
@@ -51,7 +51,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
-    Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
+    Because you can load different versions of a dataset simultaneously, the dataset RID and version number are tracked, and a new
     sqllite instance is created for every new dataset version present.
     Attributes:
@@ -290,6 +290,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
         return DatasetBag(self, dataset_rid or self.dataset_rid)
     def dataset_version(self, dataset_rid: Optional[RID] = None) -> DatasetVersion:
+        """Return the version of the specified dataset."""
         if dataset_rid and dataset_rid not in self.bag_rids:
             DerivaMLException(f"Dataset RID {dataset_rid} is not in model.")
         return self.bag_rids[dataset_rid]

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/dataset.py RENAMED Viewed

@@ -232,12 +232,10 @@ class Dataset:
         """Increment the version of the specified dataset_table.
         Args:
-          dataset_rid: RID to a dataset_table
-          component: Which version of the dataset_table to increment.
-          dataset_rid: RID of the dataset whose version is to be incremented.
-          component: Major, Minor or Patch
-          description: Description of the version update of the dataset_table.
-          execution_rid: Which execution is performing increment.
+            dataset_rid: RID of the dataset whose version is to be incremented.
+            component: Which version of the dataset_table to increment. Major, Minor or Patch
+            description: Description of the version update of the dataset_table.
+            execution_rid: Which execution is performing increment.
         Returns:
           new semantic version of the dataset_table as a 3-tuple
@@ -275,9 +273,6 @@ class Dataset:
             description: Description of the dataset_table.
             execution_rid: Execution under which the dataset_table will be created.
             version: Version of the dataset_table.
-            type: str | list[str]:
-            description: str:
         Returns:
             New dataset_table RID.
@@ -349,7 +344,6 @@ class Dataset:
         Args:
             dataset_rid: RID of the dataset_table to delete.
             recurse: If True, delete the dataset_table along with any nested datasets. (Default value = False)
-            dataset_rid: RID:
         """
         # Get association table entries for this dataset_table
         # Delete association table entries
@@ -397,7 +391,7 @@ class Dataset:
             filtered_path = dataset_path
         else:
             filtered_path = dataset_path.filter(
-                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)
+                (dataset_path.Deleted == False) | (dataset_path.Deleted == None)  # noqa: E712
             )
         # Get a list of all the dataset_type values associated with this dataset_table.
@@ -439,8 +433,7 @@ class Dataset:
         routine makes it possible to add objects from the specified table to a dataset_table.
         Args:
-            element: Name or the table or table object that is to be added to the dataset_table.
-            element: str | Table:
+            element: Name of the table or table object that is to be added to the dataset_table.
         Returns:
             The table object that was added to the dataset_table.
@@ -464,7 +457,6 @@ class Dataset:
         Args:
             dataset_rid: param recurse: If this is a nested dataset_table, list the members of the contained datasets
-            dataset_rid: RID:
             recurse:  (Default value = False)
             limit: If provided, the maximum number of members to return for each element type.
@@ -677,7 +669,6 @@ class Dataset:
         Args:
             dataset_rid: return: RID of the parent dataset_table.
-            dataset_rid: RID:
         Returns:
             RID of the parent dataset_table.

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/dataset_bag.py RENAMED Viewed

@@ -168,7 +168,7 @@ class DatasetBag:
                 yield dict(zip(col_names, row))
     @validate_call
-    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, Any]]:
+    def list_dataset_members(self, recurse: bool = False) -> dict[str, dict[str, list]]:
         """Return a list of entities associated with a specific _dataset_table.
         Args:

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/deriva_ml_base.py RENAMED Viewed

@@ -265,10 +265,13 @@ class DerivaML(Dataset):
             is_notebook = True
         else:
             stack = inspect.stack()
+            # Get the caller's filename, which is two up the stack from here.
             if len(stack) > 1:
-                filename = Path(
-                    stack[2].filename
-                )  # Get the caller's filename, which is two up the stack from here.
+                filename = Path(stack[2].filename)
+                if not filename.exists():
+                    # Being called from command line interpreter.
+                    filename = "REPL"
+                # Get the caller's filename, which is two up the stack from here.
             else:
                 raise DerivaMLException(
                     "Looking for caller failed"
@@ -326,7 +329,6 @@ class DerivaML(Dataset):
         """Return a local file path in which to place a CSV to add values to a table on upload.
         Args:
-          table: return:
           table: str | Table:
         Returns:
@@ -1143,13 +1145,17 @@ class DerivaML(Dataset):
             if self._is_notebook
             else f"git hash-object {self.executable_path}"
         )
-        checksum = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            check=True,
-            shell=True,
-        ).stdout.strip()
+        checksum = (
+            subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=False,
+                shell=True,
+            ).stdout.strip()
+            if self.executable_path != "REPL"
+            else "1"
+        )
         return Workflow(
             name=name,
@@ -1172,6 +1178,8 @@ class DerivaML(Dataset):
         """
         # Get repo URL from local gitHub repo.
+        if self.executable_path == "REPL":
+            return "REPL", True
         try:
             result = subprocess.run(
                 ["git", "remote", "get-url", "origin"],
@@ -1240,6 +1248,7 @@ class DerivaML(Dataset):
     # @validate_call
     def restore_execution(self, execution_rid: Optional[RID] = None) -> "Execution":
         """Return an Execution object for a previously started execution with the specified RID."""
         from .execution import Execution
         # Find path to execution

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/deriva_model.py RENAMED Viewed

@@ -27,6 +27,8 @@ from typing import Iterable, Optional
 class DerivaModel:
     """Augmented interface to deriva model class.
+    This class provides a number of DerivaML specific methods that augment the interface in the deriva model class.
     Attributes:
         domain_schema: Schema name for domain specific tables and relationships.
         model: ERMRest model for the catalog.
@@ -71,6 +73,10 @@ class DerivaModel:
             # No domain schema defined.
             self.domain_schema = domain_schema
+    def __getattr__(self, name):
+        # Called only if `name` is not found by normal attribute lookup.  Delegate attributes to model class.
+        return getattr(self.model, name)
     def name_to_table(self, table: str | Table) -> Table:
         """Return the table object corresponding to the given table name.
@@ -129,7 +135,7 @@ class DerivaModel:
     def find_association(self, table1: Table | str, table2: Table | str) -> Table:
         """Given two tables, return an association table that connects the two.
-        Raises"
+        Raises:
             DerivaML exception if there is either not an association table or more than one association table.
         """
         table1 = self.name_to_table(table1)
@@ -138,7 +144,7 @@ class DerivaModel:
         tables = [
             a.table
             for a in table1.find_associations(pure=False)
-            if (t := a.other_fkeys.pop().pk_table) == table2
+            if a.other_fkeys.pop().pk_table == table2
         ]
         if len(tables) == 1:
             return tables[0]

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/execution.py RENAMED Viewed

@@ -66,7 +66,6 @@ class AssetFilePath(type(Path())):
         asset_rid: The RID of the asset if it has been uploaded into an asset table
     """
     def __new__(
         cls,
         asset_path,
@@ -76,6 +75,17 @@ class AssetFilePath(type(Path())):
         asset_types: list[str] | str,
         asset_rid: Optional[RID] = None,
     ):
+        """
+        Create a new Path object that has additional information related to the use of this path as an asset.
+        Args:
+            asset_path: Local path to the location of the asset.
+            asset_name:  The name of the asset in the catalog (e.g. the asset table name).
+            file_name:  Name of the local file that contains the contents of the asset.
+            asset_metadata: Any additional columns associated with this asset beyond the URL, Length, and checksum.
+            asset_types:  A list of terms from the Asset_Type controlled vocabulary.
+            asset_rid:  The RID of the asset if it has been uploaded into an asset table
+        """
         obj = super().__new__(cls, asset_path)
         obj.asset_types = (
             asset_types if isinstance(asset_types, list) else [asset_types]
@@ -133,7 +143,7 @@ class Execution:
             ml_object: The DerivaML instance that created the execution.
             reload: RID of previously initialized execution object.
         """
-        self.asset_paths: list[Path] = []
+        self.asset_paths: list[AssetFilePath] = []
         self.configuration = configuration
         self._ml_object = ml_object
         self._model = ml_object.model
@@ -141,7 +151,7 @@ class Execution:
         self.start_time = None
         self.stop_time = None
         self.status = Status.created
-        self.uploaded_assets: list[Path] = []
+        self.uploaded_assets: Optional[dict[str, list[AssetFilePath]]] = None
         self.configuration.argv = sys.argv
         self.dataset_rids: list[RID] = []
@@ -152,6 +162,7 @@ class Execution:
         self._cache_dir = self._ml_object.cache_dir
         self._dry_run = dry_run
+        # Make sure we have a good workflow.
         if isinstance(self.configuration.workflow, Workflow):
             self.workflow_rid = (
                 self._ml_object.add_workflow(self.configuration.workflow)
@@ -168,6 +179,7 @@ class Execution:
                     "Workflow specified in execution configuration is not a Workflow"
                 )
+        # Make sure the datasets and assets are valid.
         for d in self.configuration.datasets:
             if self._ml_object.resolve_rid(d.rid).table.name != "Dataset":
                 raise DerivaMLException(
@@ -265,7 +277,7 @@ class Execution:
             file_name="configuration.json",
             asset_types=ExecMetadataVocab.execution_config.value,
         )
-        with open(cfile, "w", encoding="utf-8") as config_file:
+        with open(cfile.as_posix(), "w", encoding="utf-8") as config_file:
             json.dump(self.configuration.model_dump(), config_file)
         # save runtime env
@@ -387,7 +399,7 @@ class Execution:
         try:
             self.update_status(Status.running, "Uploading execution files...")
             results = upload_directory(self._model, self._asset_root)
-        except Exception as e:
+        except RuntimeError as e:
             error = format_exception(e)
             self.update_status(Status.failed, error)
             raise DerivaMLException(f"Fail to upload execution_assets. Error: {error}")
@@ -519,7 +531,7 @@ class Execution:
     def upload_execution_outputs(
         self, clean_folder: bool = True
-    ) -> dict[str, AssetFilePath]:
+    ) -> dict[str, list[AssetFilePath]]:
         """Upload all the assets and metadata associated with the current execution.
         This will include any new assets, features, or table values.
@@ -535,11 +547,11 @@ class Execution:
         if self._dry_run:
             return {}
         try:
-            uploaded_assets = self._upload_execution_dirs()
+            self.uploaded_assets = self._upload_execution_dirs()
             self.update_status(Status.completed, "Successfully end the execution.")
             if clean_folder:
                 self._clean_folder_contents(self._execution_root)
-            return uploaded_assets
+            return self.uploaded_assets
         except Exception as e:
             error = format_exception(e)
             self.update_status(Status.failed, error)
@@ -688,16 +700,26 @@ class Execution:
         asset_name: str,
         file_name: str,
         asset_types: Optional[list[str] | str] = None,
+        copy_file=False,
         **kwargs,
     ) -> AssetFilePath:
         """Return a pathlib Path to the directory in which to place files for the specified execution_asset type.
-        These files are uploaded as part of the upload_execution method in DerivaML class.
+        Given the name of an asset table, and a file name, register the file for upload, and return a path to that
+        file in the upload directory.  In addition to the filename, additional asset metadata and file asset types may
+        be specified.
+        This routine has three modes, depending on if file_name refers to an existing file.  If it doesn't, a path
+        to a new file with the specified name is returned.  The caller can then open that file for writing.
+        If the provided filename refers to an existing file and the copy_file argument is False (the default), then the
+        returned path contains a symbolic link to that file.  If the copy_file argument is True then the contents of
+        file_name are copied into the target directory.
         Args:
             asset_name: Type of asset to be uploaded.  Must be a term in Asset_Type controlled vocabulary.
-            asset_types: Type of asset to be uploaded.  Defaults to name of the asset.
             file_name: Name of file to be uploaded.
+            asset_types: Type of asset to be uploaded.  Defaults to name of the asset.
             **kwargs: Any additional metadata values that may be part of the asset table.
         Returns:
@@ -716,26 +738,33 @@ class Execution:
         for t in asset_types:
             self._ml_object.lookup_term(MLVocab.asset_type, t)
+        file_name = Path(file_name)
         asset_path = asset_file_path(
             self._working_dir,
             self.execution_rid,
             self._model.name_to_table(asset_name),
-            file_name,
+            file_name.name,
             metadata=kwargs,
         )
+        if file_name.exists():
+            if copy_file:
+                asset_path.write_bytes(file_name.read_bytes())
+            else:
+                asset_path.symlink_to(file_name)
         # Persist the asset types into a file
         with open(
             asset_type_path(self._working_dir, self.execution_rid, asset_table),
             "a",
             encoding="utf-8",
         ) as f:
-            f.write(json.dumps({file_name: asset_types}) + "\n")
+            f.write(json.dumps({file_name.name: asset_types}) + "\n")
         return AssetFilePath(
             asset_path=asset_path,
             asset_name=asset_name,
-            file_name=file_name,
+            file_name=file_name.name,
             asset_metadata=kwargs,
             asset_types=asset_types,
         )
@@ -760,6 +789,7 @@ class Execution:
     def execute(self) -> Execution:
         """Initiate an execution with provided configuration. Can be used in a context manager."""
+        self.execution_start()
         return self
     @validate_call

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/execution_configuration.py RENAMED Viewed

@@ -1,3 +1,7 @@
+"""
+Classes that are used to define an execution configuration.
+"""
 from __future__ import annotations
 import json

{deriva_ml-1.11.0 → deriva_ml-1.12.0}/src/deriva_ml/upload.py RENAMED Viewed

@@ -216,10 +216,10 @@ def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table):
         "checksum_types": ["sha256", "md5"],
         "hatrac_options": {"versioned_urls": True},
         "hatrac_templates": {
-            "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
+            "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}.{{file_ext}}",
             "content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
         },
-        "record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={{file_name}}",
+        "record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={file_name}.{file_ext}",
     }
@@ -252,10 +252,10 @@ def bulk_upload_configuration(model: DerivaModel) -> dict[str, Any]:
                 "checksum_types": ["sha256", "md5"],
                 "hatrac_options": {"versioned_urls": True},
                 "hatrac_templates": {
-                    "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}",
+                    "hatrac_uri": "/hatrac/{asset_table}/{md5}.{file_name}.{file_ext}",
                     "content-disposition": "filename*=UTF-8''{file_name}.{file_ext}",
                 },
-                "record_query_template": "/entity/{target_table}/MD5={{md5}}&Filename={{file_name}}",
+                "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}.{file_ext}",
             },
             # {
             #  Upload the records into a  table
@@ -448,7 +448,7 @@ def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Pa
         asset_table: Table in which to place assets.
     Returns:
-        Path to the file in which to place asset_type values for the named asset..
+        Path to the file in which to place asset_type values for the named asset.
     """
     path = (
         execution_root(prefix, exec_rid=exec_rid)

{deriva_ml-1.11.0 → deriva_ml-1.12.0/src/deriva_ml.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.11.0
+Version: 1.12.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10
@@ -28,3 +28,4 @@ The script release.sh will create a new release tag in GitHub.  This script requ
 GitHUB CLI be installed.
 See [https://cli.github.com](https://cli.github.com) for instructions on how to install and configure the CLI.