PyPI - deriva-ml - Versions diffs - 1.8.10__tar.gz → 1.9.0__tar.gz - Mend

deriva-ml 1.8.10__tar.gz → 1.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81) hide show

{deriva_ml-1.8.10/src/deriva_ml.egg-info → deriva_ml-1.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deriva-ml
-Version: 1.8.10
+Version: 1.9.0
 Summary: Utilities to simplify use of Dervia and Pandas to create reproducable ML pipelines
 Author-email: ISRD <isrd-dev@isi.edu>
 Requires-Python: >=3.10

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/docs/Notebooks/DerivaML Execution.ipynb RENAMED Viewed

@@ -28,7 +28,7 @@
    "source": [
     "import builtins\n",
     "from deriva.core.utils.globus_auth_utils import GlobusNativeLogin\n",
-    "from deriva_ml import ExecutionConfiguration, Workflow, MLVocab, DerivaSystemColumns\n",
+    "from deriva_ml import ExecutionConfiguration, MLVocab, DerivaSystemColumns\n",
     "from deriva_ml.demo_catalog import create_demo_catalog, DemoML\n",
     "from IPython.display import display, Markdown, JSON\n",
     "import itertools\n",
@@ -166,12 +166,11 @@
    "metadata": {},
    "cell_type": "code",
    "source": [
-    "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Inital setup of Model File\")\n",
+    "ml_instance.add_term(MLVocab.workflow_type, \"Manual Workflow\", description=\"Initial setup of Model File\")\n",
     "ml_instance.add_term(MLVocab.execution_asset_type, \"API_Model\", description=\"Model for our API workflow\")\n",
     "\n",
-    "api_workflow = Workflow(\n",
+    "api_workflow = ml_instance.create_workflow(\n",
     "    name=\"Manual Workflow\",\n",
-    "    url='https://github.com/informatics-isi-edu/deriva-ml/blob/main/docs/Notebooks/DerivaML%20Execution.ipynb',\n",
     "    workflow_type=\"Manual Workflow\",\n",
     "    description=\"A manual operation\"\n",
     ")\n",
@@ -207,13 +206,6 @@
    "source": [
     "ml_instance.add_term(MLVocab.workflow_type, \"ML Demo\", description=\"A ML Workflow that uses Deriva ML API\")\n",
     "\n",
-    "api_workflow = Workflow(\n",
-    "    name=\"ML Demo\",\n",
-    "    url=\"https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml\",\n",
-    "    workflow_type=\"ML Demo\",\n",
-    "    description=\"A workflow that uses Deriva ML\"\n",
-    ")\n",
-    "\n",
     "config = ExecutionConfiguration(\n",
     "    datasets=[training_dataset_rid, {'rid':testing_dataset_rid, 'materialize':False}],\n",
     "    assets = [training_model_rid],\n",

deriva_ml-1.9.0/docs/user-guide/execution-configuration.md ADDED Viewed

@@ -0,0 +1,26 @@
+# Configuring an execution
+One of the essential functions of DerivaML is to help keep track how ML model results are created so that hey can be shared and reproduced.
+Every execution in DerivaML is represented by an Execution object, whick keeps track of all of the paramemters associated with and execution and
+provides a number of functions that enable a program to help keep track of the configuation and results of a model execution.
+The first step in creating a DerivaML execution is to create an `ExectuionConfiguration`.
+The `ExecutionConfiguration` class is used to specify the inputs that go are to be used by an Execution.
+These inputs include
+* A list of datasets that are used
+* A list of other files (assets) that are to be used. This can include existing models, or any other infomration that the execution might need.
+* The actual code that is being executed.
+[`ExecutionConfiguration`][deriva_ml.execution_configuration.ExecutionConfiguration]  is a Pydantic dataclass.
+As part of initializing an execution, the assets and datasets in the configuration object are downloaded and cached.
+The datasets are provided as a list of DatasetSpecw which
+```DatasetSpec(dataset_rid:RID, version:DatasetVersion, materialize:bool)```
+it will be common to just want to use the latest version of the dataset, in which case you would use: `
+````
+deriva_nl = DerivaML(...)
+dataset_rid = ...
+datasets = [DatasetSpec(dataset_rid, version=deriva_ml.dataset_version(dataset_rid))]
+```
+If a dataset is large, downloading from the catalog might take a signficant amount of time.

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/release.sh RENAMED Viewed

@@ -13,7 +13,7 @@ echo "Bumping version: $VERSION_TYPE"
 # Bump the version using bump-my-version.
 # This command should update version files, commit the changes, and create a Git tag.
-bump-my-version bump $VERSION_TYPE --verbose
+bump-my-version bump "$VERSION_TYPE" --verbose
 # Push commits and tags to the remote repository.
 echo "Pushing changes to remote repository..."
@@ -32,5 +32,6 @@ python -m build
 NEW_TAG=$(git describe --tags --abbrev=0)
 echo "New version tag: $NEW_TAG"
+twine upload "dist/*${NEW_TAG/v/}"
 echo "Release process complete!"

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/src/deriva_ml/database_model.py RENAMED Viewed

@@ -1,12 +1,15 @@
-"""Ths module constains the definition of the DatabaseModel class.  The role of this class is to provide an nterface between the BDBag representation
+"""Ths module contains the definition of the DatabaseModel class.  The role of this class is to provide an nterface between the BDBag representation
 of a dataset and a sqllite database in which the contents of the bag are stored.
 """
+from __future__ import annotations
 import logging
 import sqlite3
 from csv import reader
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Generator
 from urllib.parse import urlparse
 from deriva.core.ermrest_model import Model
@@ -20,7 +23,7 @@ from .dataset_bag import DatasetBag
 class DatabaseModelMeta(type):
     """Use metaclass to ensure that there is onl one instance per path"""
-    _paths_loaded: dict[Path:"DatabaseModel"] = {}
+    _paths_loaded: dict[Path, "DatabaseModel"] = {}
     def __call__(cls, *args, **kwargs):
         logger = logging.getLogger("deriva_ml")
@@ -47,7 +50,7 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
     Because of nested datasets, it's possible that more than one dataset rid is in a bag, or that a dataset rid might
     appear in more than one database. To help manage this, a global list of all the datasets that have been loaded
     into DatabaseModels, is kept in the class variable `_rid_map`.
     Because you can load diffent versions of a dataset simultaniously, the dataset RID and version number are tracked, and a new
     sqllite instance is created for every new dataset version present.
@@ -315,6 +318,26 @@ class DatabaseModel(DerivaModel, metaclass=DatabaseModelMeta):
             )
         return datasets
+    def get_table_as_dict(self, table: str) -> Generator[dict[str, Any], None, None]:
+        """Retrieve the contents of the specified table as a dictionary.
+        Args:
+            table: Table to retrieve data from. f schema is not provided as part of the table name,
+                the method will attempt to locate the schema for the table.
+        Returns:
+          A generator producing dictionaries containing the contents of the specified table as name/value pairs.
+        """
+        table_name = self.normalize_table_name(table)
+        with self.dbase as dbase:
+            col_names = [
+                c[1]
+                for c in dbase.execute(f'PRAGMA table_info("{table_name}")').fetchall()
+            ]
+            result = self.dbase.execute(f'SELECT * FROM "{table_name}"')
+            while row := result.fetchone():
+                yield dict(zip(col_names, row))
     def normalize_table_name(self, table: str) -> str:
         """Attempt to insert the schema into a table name if it's not provided.

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/src/deriva_ml/dataset.py RENAMED Viewed

@@ -92,7 +92,7 @@ class Dataset:
         dataset_list: list[DatasetSpec],
         description: Optional[str] = "",
         execution_rid: Optional[RID] = None,
-    ) -> RID:
+    ) -> list[dict[str, Any]]:
         schema_path = self._model.catalog.getPathBuilder().schemas[self._ml_schema]
         # Construct version records for insert
@@ -245,7 +245,7 @@ class Dataset:
           DerivaMLException: if provided RID is not to a dataset_table.
         """
-        # Find all of the datasets that are reachable from this dataset and determine their new version numbers.
+        # Find all the datasets that are reachable from this dataset and determine their new version numbers.
         related_datasets = list(self._build_dataset_graph(dataset_rid=dataset_rid))
         version_update_list = [
             DatasetSpec(
@@ -254,7 +254,7 @@ class Dataset:
             )
             for ds_rid in related_datasets
         ]
-        updated_versions = self._insert_dataset_versions(
+        self._insert_dataset_versions(
             version_update_list, description=description, execution_rid=execution_rid
         )
         return [d.version for d in version_update_list if d.rid == dataset_rid][0]
@@ -751,9 +751,10 @@ class Dataset:
         ]
     def _table_paths(
-        self, dataset: DatasetSpec = None, snapshot_catalog: Optional[DerivaML] = None
+        self,
+        dataset: Optional[DatasetSpec] = None,
+        snapshot_catalog: Optional[DerivaML] = None,
     ) -> Iterator[tuple[str, str, Table]]:
         paths = self._collect_paths(dataset and dataset.rid, snapshot_catalog)
         def source_path(path: tuple[Table, ...]):
@@ -779,17 +780,20 @@ class Dataset:
     def _collect_paths(
         self,
         dataset_rid: Optional[RID] = None,
-        snapshot_catalog: Optional[DerivaML] = None,
+        snapshot: Optional[Dataset] = None,
         dataset_nesting_depth: Optional[int] = None,
     ) -> set[tuple[Table, ...]]:
-        snapshot_catalog = snapshot_catalog or self
+        snapshot_catalog = snapshot if snapshot else self
         dataset_table = snapshot_catalog._model.schemas[self._ml_schema].tables[
             "Dataset"
         ]
         dataset_dataset = snapshot_catalog._model.schemas[self._ml_schema].tables[
             "Dataset_Dataset"
         ]
+        # Figure out what types of elements the dataset contains.
         dataset_associations = [
             a
             for a in self.dataset_table.find_associations()
@@ -812,7 +816,8 @@ class Dataset:
             ]
         else:
             included_associations = dataset_associations
-        # Get the paths through the schema and filter out all of dataset paths not used by this dataset.
+        # Get the paths through the schema and filter out all the dataset paths not used by this dataset.
         paths = {
             tuple(p)
             for p in snapshot_catalog._model._schema_to_paths()
@@ -827,7 +832,7 @@ class Dataset:
         if dataset_rid:
             for c in snapshot_catalog.list_dataset_children(dataset_rid=dataset_rid):
                 nested_paths |= self._collect_paths(
-                    c, snapshot_catalog=snapshot_catalog
+                    c, snapshot=snapshot_catalog
                 )
         else:
             # Initialize nesting depth if not already provided.

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/src/deriva_ml/dataset_bag.py RENAMED Viewed

@@ -109,7 +109,7 @@ class DatasetBag:
         for ts, on in paths:
             tables = " JOIN ".join(ts)
             on_expression = " and ".join(
-                [f"{column_name(l)}={column_name(r)}" for l, r in on]
+                [f"{column_name(left)}={column_name(right)}" for left, right in on]
             )
             sql.append(
                 f"SELECT {select_args} FROM {tables} ON {on_expression} WHERE {dataset_table_name}.RID IN ({datasets})"

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/src/deriva_ml/demo_catalog.py RENAMED Viewed

@@ -5,6 +5,7 @@ import logging
 from random import random, randint
 import tempfile
 from tempfile import TemporaryDirectory
+from typing import Optional
 import itertools
 from deriva.config.acl_config import AclConfig
@@ -18,7 +19,6 @@ from requests import HTTPError
 from deriva_ml import (
     DerivaML,
     ExecutionConfiguration,
-    Workflow,
     MLVocab,
     BuiltinTypes,
     ColumnDefinition,
@@ -169,12 +169,9 @@ def create_demo_features(ml_instance):
         description="Model for our API workflow",
     )
-    api_workflow = ml_instance.add_workflow(
-        Workflow(
-            name="API Workflow",
-            url="https://github.com/informatics-isi-edu/deriva-ml/blob/main/pyproject.toml",
-            workflow_type="API Workflow",
-        )
+    api_workflow = ml_instance.create_workflow(
+        name="API Workflow",
+        workflow_type="API Workflow",
     )
     api_execution = ml_instance.create_execution(
@@ -322,7 +319,11 @@ def create_demo_catalog(
 class DemoML(DerivaML):
     def __init__(
-        self, hostname, catalog_id, cache_dir: str = None, working_dir: str = None
+        self,
+        hostname,
+        catalog_id,
+        cache_dir: Optional[str] = None,
+        working_dir: Optional[str] = None,
     ):
         super().__init__(
             hostname=hostname,

{deriva_ml-1.8.10 → deriva_ml-1.9.0}/src/deriva_ml/deriva_definitions.py RENAMED Viewed

@@ -8,7 +8,7 @@ from enum import Enum
 from typing import Any, Iterable, Optional, Annotated
 import deriva.core.ermrest_model as em
-from urllib.parse import urlparse, urljoin
+from urllib.parse import urlparse
 from deriva.core.ermrest_model import builtin_types
 from pydantic import (
     BaseModel,
@@ -139,13 +139,18 @@ class FileSpec(BaseModel):
         if url_parts.scheme == "tag":
             return v
         elif not url_parts.scheme:
-            return f'tag://{gethostname()},{date.today()}:file://{v}'
+            return f"tag://{gethostname()},{date.today()}:file://{v}"
         else:
             raise ValidationError("url is not a file URL")
     @model_serializer()
     def serialize_filespec(self):
-        return {'URL': self.url, 'Description': self.description, 'MD5': self.md5, 'Length': self.length}
+        return {
+            "URL": self.url,
+            "Description": self.description,
+            "MD5": self.md5,
+            "Length": self.length,
+        }
 class VocabularyTerm(BaseModel):

deriva-ml 1.8.10__tar.gz → 1.9.0__tar.gz

deriva-ml 1.8.10__tar.gz → 1.9.0__tar.gz