PyPI - deriva-ml - Versions diffs - 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl - Mend

deriva-ml 1.17.10 (py3-none-any.whl) → 1.17.11 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74)

deriva_ml/__init__.py +43 -1
deriva_ml/asset/__init__.py +17 -0
deriva_ml/asset/asset.py +357 -0
deriva_ml/asset/aux_classes.py +100 -0
deriva_ml/bump_version.py +254 -11
deriva_ml/catalog/__init__.py +21 -0
deriva_ml/catalog/clone.py +1199 -0
deriva_ml/catalog/localize.py +426 -0
deriva_ml/core/__init__.py +29 -0
deriva_ml/core/base.py +817 -1067
deriva_ml/core/config.py +169 -21
deriva_ml/core/constants.py +120 -19
deriva_ml/core/definitions.py +123 -13
deriva_ml/core/enums.py +47 -73
deriva_ml/core/ermrest.py +226 -193
deriva_ml/core/exceptions.py +297 -14
deriva_ml/core/filespec.py +99 -28
deriva_ml/core/logging_config.py +225 -0
deriva_ml/core/mixins/__init__.py +42 -0
deriva_ml/core/mixins/annotation.py +915 -0
deriva_ml/core/mixins/asset.py +384 -0
deriva_ml/core/mixins/dataset.py +237 -0
deriva_ml/core/mixins/execution.py +408 -0
deriva_ml/core/mixins/feature.py +365 -0
deriva_ml/core/mixins/file.py +263 -0
deriva_ml/core/mixins/path_builder.py +145 -0
deriva_ml/core/mixins/rid_resolution.py +204 -0
deriva_ml/core/mixins/vocabulary.py +400 -0
deriva_ml/core/mixins/workflow.py +322 -0
deriva_ml/core/validation.py +389 -0
deriva_ml/dataset/__init__.py +2 -1
deriva_ml/dataset/aux_classes.py +20 -4
deriva_ml/dataset/catalog_graph.py +575 -0
deriva_ml/dataset/dataset.py +1242 -1008
deriva_ml/dataset/dataset_bag.py +1311 -182
deriva_ml/dataset/history.py +27 -14
deriva_ml/dataset/upload.py +225 -38
deriva_ml/demo_catalog.py +126 -110
deriva_ml/execution/__init__.py +46 -2
deriva_ml/execution/base_config.py +639 -0
deriva_ml/execution/execution.py +543 -242
deriva_ml/execution/execution_configuration.py +26 -11
deriva_ml/execution/execution_record.py +592 -0
deriva_ml/execution/find_caller.py +298 -0
deriva_ml/execution/model_protocol.py +175 -0
deriva_ml/execution/multirun_config.py +153 -0
deriva_ml/execution/runner.py +595 -0
deriva_ml/execution/workflow.py +223 -34
deriva_ml/experiment/__init__.py +8 -0
deriva_ml/experiment/experiment.py +411 -0
deriva_ml/feature.py +6 -1
deriva_ml/install_kernel.py +143 -6
deriva_ml/interfaces.py +862 -0
deriva_ml/model/__init__.py +99 -0
deriva_ml/model/annotations.py +1278 -0
deriva_ml/model/catalog.py +286 -60
deriva_ml/model/database.py +144 -649
deriva_ml/model/deriva_ml_database.py +308 -0
deriva_ml/model/handles.py +14 -0
deriva_ml/run_model.py +319 -0
deriva_ml/run_notebook.py +507 -38
deriva_ml/schema/__init__.py +18 -2
deriva_ml/schema/annotations.py +62 -33
deriva_ml/schema/create_schema.py +169 -69
deriva_ml/schema/validation.py +601 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/METADATA +4 -4
deriva_ml-1.17.11.dist-info/RECORD +77 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/WHEEL +1 -1
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/entry_points.txt +1 -0
deriva_ml/protocols/dataset.py +0 -19
deriva_ml/test.py +0 -94
deriva_ml-1.17.10.dist-info/RECORD +0 -45
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.11.dist-info}/top_level.txt +0 -0

deriva_ml/experiment/experiment.py ADDED Viewed

@@ -0,0 +1,411 @@
+"""Experiment analysis for DerivaML.
+This module provides the Experiment class for analyzing completed executions.
+An Experiment wraps an execution RID and provides helper methods for extracting
+configuration details, model parameters, and experiment metadata.
+Typical usage example:
+    >>> from deriva_ml import DerivaML
+    >>> from deriva_ml.execution import Experiment
+    >>>
+    >>> ml = DerivaML("localhost", 45)
+    >>> exp = Experiment(ml, "47BE")
+    >>> print(exp.name)  # e.g., "cifar10_quick"
+    >>> print(exp.config_choices)  # Hydra config names used
+    >>> print(exp.model_config)  # Model hyperparameters
+"""
+from __future__ import annotations
+import tempfile
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import yaml
+from deriva.core.hatrac_store import HatracStore
+if TYPE_CHECKING:
+    from deriva_ml.core.base import DerivaML
+    from deriva_ml.execution.execution_record import ExecutionRecord
+    from deriva_ml.asset.asset import Asset
+    from deriva_ml.dataset.dataset import Dataset
+@dataclass
+class Experiment:
+    """Wraps an execution for experiment analysis.
+    Provides convenient access to execution metadata, configuration choices,
+    model parameters, inputs, and outputs. Useful for comparing experiments
+    and generating analysis reports.
+    Attributes:
+        ml: DerivaML instance for catalog access.
+        execution_rid: RID of the execution to analyze.
+        execution: The underlying Execution object (lazy-loaded).
+        name: Experiment name from config_choices.model_config or execution RID.
+        config_choices: Dictionary of Hydra config names used.
+        model_config: Dictionary of model hyperparameters.
+        description: Execution description.
+        status: Execution status (e.g., "Completed").
+    Example:
+        >>> exp = Experiment(ml, "47BE")
+        >>> print(f"Experiment: {exp.name}")
+        >>> print(f"Config: {exp.config_choices}")
+        >>> for ds in exp.input_datasets:
+        ...     print(f"  Input: {ds.dataset_rid}")
+    """
+    ml: "DerivaML"
+    execution_rid: str
+    _execution: "ExecutionRecord | None" = field(default=None, repr=False)
+    _hydra_config: dict | None = field(default=None, repr=False)
+    _config_choices: dict | None = field(default=None, repr=False)
+    _model_config: dict | None = field(default=None, repr=False)
+    _name: str | None = field(default=None, repr=False)
+    @property
+    def execution(self) -> "ExecutionRecord":
+        """Return the ExecutionRecord for ``execution_rid``, looked up on first access."""
+        cached = self._execution
+        if cached is not None:
+            return cached
+        # First access: resolve the RID through the DerivaML instance and memoize it.
+        cached = self.ml.lookup_execution(self.execution_rid)
+        self._execution = cached
+        return cached
+    @property
+    def hydra_config(self) -> dict:
+        """Return the merged Hydra configuration for this execution.
+        The configuration is produced by ``_load_hydra_config()`` on first
+        access and cached on the instance for later reads.
+        Returns:
+            The dictionary returned by ``_load_hydra_config()``.
+        """
+        cached = self._hydra_config
+        if cached is None:
+            cached = self._hydra_config = self._load_hydra_config()
+        return cached
+    def _load_hydra_config(self) -> dict:
+        """Load Hydra configuration from execution metadata assets.
+        Finds the ``Execution_Metadata`` records linked to this execution with
+        role "Output", then downloads and parses two of them:
+        - the first file whose name ends in ``-config.yaml``: its parsed
+          contents become the returned dictionary (including any
+          ``model_config`` section);
+        - the first file whose name ends in ``-hydra.yaml``: its
+          ``hydra.runtime.choices`` mapping, minus ``hydra/``-prefixed keys,
+          is stored under ``config_choices``.
+        Returns:
+            The merged configuration, or an empty dict if neither file is found.
+        """
+        pb = self.ml.pathBuilder()
+        ml_schema = pb.schemas[self.ml.ml_schema]
+        meta_exec = ml_schema.Execution_Metadata_Execution
+        metadata_table = ml_schema.Execution_Metadata
+        # Association rows linking metadata assets to this execution as outputs.
+        query = meta_exec.filter(meta_exec.Execution == self.execution_rid)
+        query = query.filter(meta_exec.Asset_Role == "Output")
+        # Map filename -> metadata record; a later record with the same
+        # filename overwrites an earlier one.
+        metadata_files: dict[str, dict] = {}
+        for record in query.entities().fetch():
+            metadata_rid = record.get("Execution_Metadata")
+            if not metadata_rid:
+                continue
+            meta_records = list(
+                metadata_table.filter(metadata_table.RID == metadata_rid)
+                .entities()
+                .fetch()
+            )
+            if not meta_records:
+                continue
+            meta = meta_records[0]
+            filename = meta.get("Filename", "")
+            if filename:
+                metadata_files[filename] = meta
+        # Create HatracStore for downloading
+        hs = HatracStore(
+            "https",
+            self.ml.host_name,
+            self.ml.credential,
+        )
+        result: dict = {}
+        # Load config.yaml for model_config and full configuration
+        for filename, meta in metadata_files.items():
+            if not filename.endswith("-config.yaml"):
+                continue
+            url = meta.get("URL")
+            if url:
+                loaded = self._read_remote_yaml(hs, url, filename)
+                if loaded is not None:
+                    result = loaded
+            break  # only the first matching file is considered
+        # Load hydra.yaml for config_choices (runtime.choices)
+        for filename, meta in metadata_files.items():
+            if not filename.endswith("-hydra.yaml"):
+                continue
+            url = meta.get("URL")
+            if url:
+                hydra_data = self._read_remote_yaml(hs, url, filename)
+                if hydra_data is not None:
+                    # `or {}` guards against keys that are present with a null
+                    # value, which a .get() default would not replace.
+                    runtime = (hydra_data.get("hydra") or {}).get("runtime") or {}
+                    choices = runtime.get("choices") or {}
+                    # Filter out hydra internal choices
+                    result["config_choices"] = {
+                        k: v
+                        for k, v in choices.items()
+                        if not k.startswith("hydra/")
+                    }
+            break  # only the first matching file is considered
+        return result
+    @staticmethod
+    def _read_remote_yaml(store: "HatracStore", url: str, filename: str) -> dict | None:
+        """Download ``url`` into a temporary directory and parse it as YAML.
+        Args:
+            store: Object store used for the download (``get_obj`` is called on it).
+            url: Object URL taken from the metadata record.
+            filename: Name given to the temporary local copy.
+        Returns:
+            The parsed mapping; an empty dict if the document is empty or its
+            top level is not a mapping; None if no local file was produced.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dest = Path(tmpdir) / filename
+            store.get_obj(url, destfilename=str(dest))
+            if not dest.exists():
+                return None
+            # Explicit encoding so parsing does not depend on the platform default.
+            with open(dest, encoding="utf-8") as f:
+                data = yaml.safe_load(f)
+        # Callers index into the result, so anything but a mapping becomes {}.
+        return data if isinstance(data, dict) else {}
+    @property
+    def config_choices(self) -> dict[str, str]:
+        """Return the Hydra configuration choices (selected config names).
+        The value is read from the ``config_choices`` entry of ``hydra_config``
+        on first access and cached; an empty dict is used when the entry is absent.
+        Returns:
+            Dictionary mapping config group names to the selected config names,
+            e.g., {"model_config": "cifar10_quick", "datasets": "cifar10_labeled_split"}
+        """
+        cached = self._config_choices
+        if cached is not None:
+            return cached
+        self._config_choices = self.hydra_config.get("config_choices", {})
+        return self._config_choices
+    @property
+    def model_config(self) -> dict[str, Any]:
+        """Get the model configuration parameters.
+        Reads the ``model_config`` section of ``hydra_config`` on first access
+        and caches it. A missing section, or one present with a null value,
+        yields an empty dict so callers can always iterate ``.items()``.
+        Returns:
+            Dictionary of model hyperparameters from the Hydra config,
+            e.g., {"epochs": 3, "learning_rate": 0.001, "batch_size": 128}
+        """
+        if self._model_config is None:
+            # `or {}` also covers `model_config: null` in the YAML, which a
+            # .get() default would pass through as None.
+            self._model_config = self.hydra_config.get("model_config") or {}
+        return self._model_config
+    @property
+    def name(self) -> str:
+        """Return the experiment name.
+        Uses the ``model_config`` entry of ``config_choices`` when present and
+        falls back to the execution RID otherwise. The result is cached.
+        Returns:
+            Experiment name string.
+        """
+        cached = self._name
+        if cached is not None:
+            return cached
+        fallback = self.execution_rid
+        self._name = self.config_choices.get("model_config", fallback)
+        return self._name
+    @property
+    def description(self) -> str:
+        """Return the execution record's description, or "" when it is unset or empty."""
+        text = self.execution.description
+        return text if text else ""
+    @property
+    def status(self) -> str:
+        """Return the ``value`` of the execution record's status, or "" when no status is set."""
+        current = self.execution.status
+        return current.value if current else ""
+    @property
+    def input_datasets(self) -> list["Dataset"]:
+        """Return the datasets used as inputs by this experiment.
+        Delegates to ``list_input_datasets()`` on the execution record; the
+        result is not cached here.
+        Returns:
+            List of Dataset objects used as inputs.
+        """
+        record = self.execution
+        return record.list_input_datasets()
+    @property
+    def input_assets(self) -> list["Asset"]:
+        """Return the assets used as inputs by this experiment.
+        Delegates to ``list_assets(asset_role="Input")`` on the execution
+        record; the result is not cached here.
+        Returns:
+            List of Asset objects used as inputs.
+        """
+        record = self.execution
+        return record.list_assets(asset_role="Input")
+    @property
+    def output_assets(self) -> list["Asset"]:
+        """Return the assets produced as outputs by this experiment.
+        Delegates to ``list_assets(asset_role="Output")`` on the execution
+        record; the result is not cached here.
+        Returns:
+            List of Asset objects produced as outputs.
+        """
+        record = self.execution
+        return record.list_assets(asset_role="Output")
+    def get_chaise_url(self) -> str:
+        """Get the Chaise URL for viewing this execution in the browser.
+        The URL is built from the DerivaML instance's ``host_name``,
+        ``catalog_id`` and ``ml_schema`` plus this experiment's execution RID.
+        Returns:
+            URL string for the execution record in Chaise.
+        """
+        # Use the configured ML schema name (the same one the metadata queries
+        # in this class use) instead of a hard-coded "deriva-ml".
+        # NOTE(review): presumably the Execution table lives in that schema
+        # alongside Execution_Metadata — confirm.
+        return (
+            f"https://{self.ml.host_name}/chaise/record/#{self.ml.catalog_id}/"
+            f"{self.ml.ml_schema}:Execution/RID={self.execution_rid}"
+        )
+    def summary(self) -> dict[str, Any]:
+        """Build a plain-dict summary of the experiment.
+        Returns:
+            Dictionary with experiment metadata suitable for display or analysis:
+            - name, execution_rid, description, status
+            - config_choices: Hydra config names used
+            - model_config: Model hyperparameters (keys starting with "_" omitted)
+            - input_datasets: List of input dataset info
+            - input_assets: List of input asset info (non-metadata)
+            - output_assets: List of output asset info (non-metadata)
+            - metadata_assets: List of Execution_Metadata assets, each RID listed once
+            - url: Chaise URL to view execution
+        """
+        def describe(asset: "Asset") -> dict[str, Any]:
+            """Flatten one asset into the fields reported in the summary."""
+            return {
+                "asset_rid": asset.asset_rid,
+                "asset_table": asset.asset_table,
+                "filename": asset.filename,
+                "description": asset.description,
+                "asset_types": asset.asset_types,
+                "url": asset.url,
+            }
+        inputs: list[dict[str, Any]] = []
+        outputs: list[dict[str, Any]] = []
+        metadata: list[dict[str, Any]] = []
+        # Inputs: metadata assets go to their own bucket, everything else is an input.
+        for asset in self.input_assets:
+            bucket = metadata if asset.asset_table == "Execution_Metadata" else inputs
+            bucket.append(describe(asset))
+        # Outputs: same split, but a metadata asset already seen as an input is
+        # not listed a second time.
+        for asset in self.output_assets:
+            if asset.asset_table != "Execution_Metadata":
+                outputs.append(describe(asset))
+                continue
+            already_listed = any(
+                entry["asset_rid"] == asset.asset_rid for entry in metadata
+            )
+            if not already_listed:
+                metadata.append(describe(asset))
+        return {
+            "name": self.name,
+            "execution_rid": self.execution_rid,
+            "description": self.description,
+            "status": self.status,
+            "config_choices": self.config_choices,
+            "model_config": {
+                key: value
+                for key, value in self.model_config.items()
+                if not key.startswith("_")
+            },
+            "input_datasets": [
+                {
+                    "dataset_rid": dataset.dataset_rid,
+                    "description": dataset.description,
+                    "version": str(dataset.current_version) if dataset.current_version else None,
+                    "dataset_types": dataset.dataset_types,
+                }
+                for dataset in self.input_datasets
+            ],
+            "input_assets": inputs,
+            "output_assets": outputs,
+            "metadata_assets": metadata,
+            "url": self.get_chaise_url(),
+        }
+    def to_markdown(self, show_datasets: bool = True, show_assets: bool = True) -> str:
+        """Generate a markdown summary of this experiment.
+        Returns a formatted markdown string with clickable links, configuration
+        details, and optionally input datasets and assets.
+        Args:
+            show_datasets: If True, include input datasets with nested children.
+            show_assets: If True, include input assets.
+        Returns:
+            Markdown-formatted string.
+        Example:
+            >>> exp = ml.lookup_experiment("47BE")
+            >>> print(exp.to_markdown())
+        """
+        lines = []
+        # Header with execution link
+        lines.append(f"### {self.name} ([{self.execution_rid}]({self.get_chaise_url()}))")
+        # Description
+        description = self.description
+        if description:
+            lines.append(f"**Description:** {description}")
+        # Config choices
+        config_choices = self.config_choices
+        if config_choices:
+            choices_str = ", ".join(
+                f"`{k}={v}`" for k, v in sorted(config_choices.items())
+            )
+            lines.append(f"**Configuration Choices:** {choices_str}")
+        # Model configuration (filter internal fields)
+        model_cfg = {
+            k: v for k, v in self.model_config.items() if not k.startswith("_")
+        }
+        if model_cfg:
+            lines.append("**Model Configuration:**")
+            lines.extend(f"- **{k}**: {v}" for k, v in sorted(model_cfg.items()))
+        # Input datasets. The input_datasets/input_assets properties ask the
+        # execution record for a fresh list on every access, so each is read
+        # once here and reused for both the emptiness check and the loop.
+        datasets = self.input_datasets if show_datasets else None
+        if datasets:
+            lines.append("**Input Datasets:**")
+            lines.extend(
+                ds.to_markdown(show_children=True, indent=0) for ds in datasets
+            )
+        # Input assets
+        assets = self.input_assets if show_assets else None
+        if assets:
+            lines.append("**Input Assets:**")
+            lines.extend(
+                f"- [{asset.asset_rid}]({asset.get_chaise_url()}): {asset.filename}"
+                for asset in assets
+            )
+        return "\n".join(lines)
+    def display_markdown(self, show_datasets: bool = True, show_assets: bool = True) -> None:
+        """Render this experiment's markdown summary through IPython's display machinery.
+        Builds the text with ``to_markdown()`` and passes it, wrapped in
+        ``IPython.display.Markdown``, to ``IPython.display.display``.
+        Args:
+            show_datasets: If True, display input datasets with nested children.
+            show_assets: If True, display input assets.
+        Example:
+            >>> exp = ml.lookup_experiment("47BE")
+            >>> exp.display_markdown()
+        """
+        # Imported here, so IPython is only needed when this method is called.
+        from IPython.display import Markdown, display
+        rendered = Markdown(self.to_markdown(show_datasets, show_assets))
+        display(rendered)
+    def __repr__(self) -> str:
+        """Return a short representation showing the experiment name and execution RID."""
+        label, rid = self.name, self.execution_rid
+        return f"Experiment(name={label!r}, rid={rid!r})"

deriva_ml/feature.py CHANGED Viewed

@@ -16,7 +16,12 @@ from pathlib import Path
 from types import UnionType
 from typing import TYPE_CHECKING, ClassVar, Optional, Type
-from deriva.core.ermrest_model import Column, FindAssociationResult
+# Deriva imports - use importlib to avoid shadowing by local 'deriva.py' files
+import importlib
+_ermrest_model = importlib.import_module("deriva.core.ermrest_model")
+Column = _ermrest_model.Column
+FindAssociationResult = _ermrest_model.FindAssociationResult
 from pydantic import BaseModel, create_model
 if TYPE_CHECKING:

deriva_ml/install_kernel.py CHANGED Viewed

@@ -1,3 +1,78 @@
+"""Jupyter kernel installation utility for DerivaML virtual environments.
+This module provides a command-line tool for installing a Jupyter kernel that
+points to the current Python virtual environment. This allows Jupyter notebooks
+to use the DerivaML environment with all its dependencies.
+Why Install a Kernel?
+---------------------
+When working with Jupyter notebooks, the kernel determines which Python
+environment executes the code. By default, Jupyter may not see packages
+installed in your virtual environment. Installing a kernel creates a
+link so Jupyter can find and use your DerivaML environment.
+How It Works
+------------
+1. Detects the current virtual environment name from ``pyvenv.cfg``
+2. Normalizes the name to be Jupyter-compatible (lowercase letters, digits, dots, underscores, hyphens)
+3. Registers the kernel with Jupyter using ipykernel's install mechanism
+4. The kernel appears in Jupyter's kernel selector with a friendly display name
+The kernel is installed in the user's Jupyter data directory by default,
+making it available across all Jupyter instances for that user.
+Usage
+-----
+Command line (after activating your virtual environment)::
+    # Install kernel for current virtual environment
+    deriva-ml-install-kernel
+    # Or run as a module
+    python -m deriva_ml.install_kernel
+As a module::
+    from deriva_ml.install_kernel import main
+    main()
+After installation, the kernel will appear in Jupyter with a name like
+"Python (deriva-ml)" or "Python (my-project)" depending on your venv name.
+Example Workflow
+----------------
+Setting up a new DerivaML project with Jupyter support::
+    # Create and activate virtual environment
+    $ uv venv --prompt my-ml-project
+    $ source .venv/bin/activate
+    # Install DerivaML
+    $ uv pip install deriva-ml
+    # Install Jupyter kernel
+    $ deriva-ml-install-kernel
+    Installed Jupyter kernel 'my-ml-project' with display name 'Python (my-ml-project)'
+    # Start Jupyter and select the new kernel
+    $ jupyter lab
+Kernel Location
+---------------
+Kernels are installed to the user's Jupyter data directory:
+- **Linux/macOS**: ``~/.local/share/jupyter/kernels/``
+- **Windows**: ``%APPDATA%\\jupyter\\kernels\\``
+Each kernel is a directory containing a ``kernel.json`` file that specifies
+the Python executable path and display name.
+See Also
+--------
+- Jupyter kernels documentation: https://jupyter-client.readthedocs.io/en/latest/kernels.html
+- ipykernel: https://github.com/ipython/ipykernel
+"""
 import re
 import sys
 from argparse import ArgumentParser
@@ -8,9 +83,18 @@ from ipykernel.kernelspec import install as install_kernel
 def _dist_name_for_this_package() -> str:
-    """
-    Try to resolve the distribution name that provides this package.
-    Works in editable installs and wheels.
+    """Resolve the distribution name that provides this package.
+    Works in both editable installs and wheels by using importlib.metadata
+    to map the top-level package name to its distribution.
+    Returns:
+        The distribution name (e.g., "deriva-ml").
+    Example:
+        >>> name = _dist_name_for_this_package()
+        >>> print(name)
+        deriva-ml
     """
     # Top-level package name of this module (your_pkg)
     top_pkg = __name__.split(".")[0]
@@ -25,8 +109,23 @@ def _dist_name_for_this_package() -> str:
 def _normalize_kernel_name(name: str) -> str:
-    """
-    Jupyter kernel directory names should be simple: lowercase, [-a-z0-9_].
+    """Normalize a name to be valid as a Jupyter kernel directory name.
+    Jupyter kernel directory names should be simple: lowercase letters,
+    digits, hyphens, underscores, and dots only. This function converts
+    any input string to a valid kernel name.
+    Args:
+        name: The input name to normalize (e.g., "My Project 2.0").
+    Returns:
+        A normalized kernel name (e.g., "my-project-2.0").
+    Example:
+        >>> _normalize_kernel_name("My ML Project!")
+        'my-ml-project-'
+        >>> _normalize_kernel_name("deriva-ml")
+        'deriva-ml'
     """
     name = name.strip().lower()
     name = re.sub(r"[^a-z0-9._-]+", "-", name)
@@ -34,6 +133,23 @@ def _normalize_kernel_name(name: str) -> str:
 def _name_for_this_venv() -> str:
+    """Extract the virtual environment name from pyvenv.cfg.
+    Reads the ``prompt`` setting from the current environment's pyvenv.cfg
+    file. This is set when creating a venv with ``--prompt`` flag, or
+    defaults to the directory name.
+    Returns:
+        The virtual environment prompt/name, or empty string if not found.
+    Raises:
+        FileNotFoundError: If not running in a virtual environment (no pyvenv.cfg).
+    Example:
+        >>> # In a venv created with: uv venv --prompt my-project
+        >>> _name_for_this_venv()
+        'my-project'
+    """
     config_path = Path(sys.prefix) / "pyvenv.cfg"
     with config_path.open() as f:
         m = re.search("prompt *= *(?P<prompt>.*)", f.read())
@@ -41,7 +157,28 @@ def _name_for_this_venv() -> str:
 def main() -> None:
-    parser = ArgumentParser()
+    """Main entry point for the kernel installation tool.
+    Installs a Jupyter kernel for the current virtual environment. The kernel
+    name and display name are derived from the virtual environment's prompt
+    setting in pyvenv.cfg.
+    The kernel is installed to the user's Jupyter data directory, making it
+    available for all Jupyter instances run by that user.
+    Command-line Arguments:
+        --install-local: Install kernel to the venv's prefix directory instead
+            of the user's Jupyter data directory. (Currently not fully implemented)
+    Example:
+        >>> # Typically called via command line:
+        >>> # $ deriva-ml-install-kernel
+        >>> main()
+        Installed Jupyter kernel 'my-project' with display name 'Python (my-project)'
+    """
+    parser = ArgumentParser(
+        description="Install a Jupyter kernel for the current virtual environment."
+    )
     parser.add_argument(
         "--install-local",
         action="store_true",

deriva-ml 1.17.10__py3-none-any.whl → 1.17.11__py3-none-any.whl

deriva-ml 1.17.10 (py3-none-any.whl) → 1.17.11 (py3-none-any.whl)