PyPI - deriva-ml - Versions diffs - 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl - Mend

deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

deriva_ml/__init__.py +69 -1
deriva_ml/asset/__init__.py +17 -0
deriva_ml/asset/asset.py +357 -0
deriva_ml/asset/aux_classes.py +100 -0
deriva_ml/bump_version.py +254 -11
deriva_ml/catalog/__init__.py +31 -0
deriva_ml/catalog/clone.py +1939 -0
deriva_ml/catalog/localize.py +426 -0
deriva_ml/core/__init__.py +29 -0
deriva_ml/core/base.py +845 -1067
deriva_ml/core/config.py +169 -21
deriva_ml/core/constants.py +120 -19
deriva_ml/core/definitions.py +123 -13
deriva_ml/core/enums.py +47 -73
deriva_ml/core/ermrest.py +226 -193
deriva_ml/core/exceptions.py +297 -14
deriva_ml/core/filespec.py +99 -28
deriva_ml/core/logging_config.py +225 -0
deriva_ml/core/mixins/__init__.py +42 -0
deriva_ml/core/mixins/annotation.py +915 -0
deriva_ml/core/mixins/asset.py +384 -0
deriva_ml/core/mixins/dataset.py +237 -0
deriva_ml/core/mixins/execution.py +408 -0
deriva_ml/core/mixins/feature.py +365 -0
deriva_ml/core/mixins/file.py +263 -0
deriva_ml/core/mixins/path_builder.py +145 -0
deriva_ml/core/mixins/rid_resolution.py +204 -0
deriva_ml/core/mixins/vocabulary.py +400 -0
deriva_ml/core/mixins/workflow.py +322 -0
deriva_ml/core/validation.py +389 -0
deriva_ml/dataset/__init__.py +2 -1
deriva_ml/dataset/aux_classes.py +20 -4
deriva_ml/dataset/catalog_graph.py +575 -0
deriva_ml/dataset/dataset.py +1242 -1008
deriva_ml/dataset/dataset_bag.py +1311 -182
deriva_ml/dataset/history.py +27 -14
deriva_ml/dataset/upload.py +225 -38
deriva_ml/demo_catalog.py +126 -110
deriva_ml/execution/__init__.py +46 -2
deriva_ml/execution/base_config.py +639 -0
deriva_ml/execution/execution.py +543 -242
deriva_ml/execution/execution_configuration.py +26 -11
deriva_ml/execution/execution_record.py +592 -0
deriva_ml/execution/find_caller.py +298 -0
deriva_ml/execution/model_protocol.py +175 -0
deriva_ml/execution/multirun_config.py +153 -0
deriva_ml/execution/runner.py +595 -0
deriva_ml/execution/workflow.py +223 -34
deriva_ml/experiment/__init__.py +8 -0
deriva_ml/experiment/experiment.py +411 -0
deriva_ml/feature.py +6 -1
deriva_ml/install_kernel.py +143 -6
deriva_ml/interfaces.py +862 -0
deriva_ml/model/__init__.py +99 -0
deriva_ml/model/annotations.py +1278 -0
deriva_ml/model/catalog.py +286 -60
deriva_ml/model/database.py +144 -649
deriva_ml/model/deriva_ml_database.py +308 -0
deriva_ml/model/handles.py +14 -0
deriva_ml/run_model.py +319 -0
deriva_ml/run_notebook.py +507 -38
deriva_ml/schema/__init__.py +18 -2
deriva_ml/schema/annotations.py +62 -33
deriva_ml/schema/create_schema.py +169 -69
deriva_ml/schema/validation.py +601 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/METADATA +4 -4
deriva_ml-1.17.12.dist-info/RECORD +77 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/WHEEL +1 -1
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/entry_points.txt +1 -0
deriva_ml/protocols/dataset.py +0 -19
deriva_ml/test.py +0 -94
deriva_ml-1.17.10.dist-info/RECORD +0 -45
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/licenses/LICENSE +0 -0
{deriva_ml-1.17.10.dist-info → deriva_ml-1.17.12.dist-info}/top_level.txt +0 -0

deriva_ml/core/validation.py ADDED Viewed

@@ -0,0 +1,389 @@
+"""Centralized validation configuration for DerivaML.
+This module provides shared Pydantic configuration and RID / vocabulary
+validation utilities used throughout DerivaML.
+The module provides:
+    - VALIDATION_CONFIG: Shared ConfigDict for Pydantic models and @validate_call
+    - DERIVA_ML_CONFIG: Alias for VALIDATION_CONFIG (the very same object)
+    - STRICT_VALIDATION_CONFIG: Same settings, but extra fields are forbidden
+    - validate_rids(): Validate that RIDs exist in the catalog
+    - validate_vocabulary_terms(): Validate that terms exist in a vocabulary
+    - ValidationResult: Result container for validation operations
+Example (Pydantic config):
+    >>> from deriva_ml.core.validation import VALIDATION_CONFIG
+    >>> from pydantic import validate_call
+    >>>
+    >>> @validate_call(config=VALIDATION_CONFIG)
+    ... def process_table(table: Table) -> None:
+    ...     pass
+Example (RID validation):
+    >>> from deriva_ml.core.validation import validate_rids
+    >>>
+    >>> result = validate_rids(
+    ...     ml,
+    ...     dataset_rids=["1-ABC", "2-DEF"],
+    ...     asset_rids=["3-GHI"],
+    ... )
+    >>> if not result.is_valid:
+    ...     for error in result.errors:
+    ...         print(f"ERROR: {error}")
+"""
+from __future__ import annotations
+from pydantic import ConfigDict
+# =============================================================================
+# Shared Pydantic Configuration
+# =============================================================================
+# Standard configuration for DerivaML Pydantic models and validate_call decorators.
+# This allows arbitrary types (like deriva Table, Column, etc.) to be used in
+# Pydantic validation without explicit type adapters.
+VALIDATION_CONFIG = ConfigDict(
+    arbitrary_types_allowed=True,
+    # Validate default values during model creation
+    validate_default=True,
+    # Use enum values instead of enum members for serialization
+    use_enum_values=True,
+)
+# Alias for backwards compatibility and clarity in model definitions.
+# NOTE: this is the same dict object as VALIDATION_CONFIG, not a copy, so a
+# mutation through either name is visible through both.
+DERIVA_ML_CONFIG = VALIDATION_CONFIG
+# Configuration for models that should be strict about extra fields.
+# Repeats the three shared settings above and adds extra="forbid"; keep the two
+# definitions in sync when changing the shared settings.
+STRICT_VALIDATION_CONFIG = ConfigDict(
+    arbitrary_types_allowed=True,
+    validate_default=True,
+    use_enum_values=True,
+    extra="forbid",  # Raise error if extra fields provided
+)
+# Public API. ValidationResult, validate_rids and validate_vocabulary_terms are
+# defined further down in this module.
+__all__ = [
+    "VALIDATION_CONFIG",
+    "DERIVA_ML_CONFIG",
+    "STRICT_VALIDATION_CONFIG",
+    "ValidationResult",
+    "validate_rids",
+    "validate_vocabulary_terms",
+]
+# =============================================================================
+# RID Validation
+# =============================================================================
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+if TYPE_CHECKING:
+    from deriva_ml.core.base import DerivaML
+@dataclass
+class ValidationResult:
+    """Result of configuration validation.
+    Collects error and warning messages together with details of every entry
+    that validated successfully. ``repr()``, ``str()`` and ``print()`` all
+    produce the same human-readable, multi-line summary, which makes results
+    easy to inspect in interactive sessions.
+    Attributes:
+        is_valid: True until an error is recorded with add_error() or an
+            invalid result is merged in.
+        errors: List of error messages for failed validations.
+        warnings: List of warning messages for potential issues.
+        validated_rids: Dictionary mapping each validated key (normally a RID)
+            to a dictionary of details resolved for it.
+    Example:
+        >>> result = validate_rids(ml, dataset_rids=["1-ABC"])
+        >>> print(result)
+        ✓ Validation passed
+          Validated 1 RID(s)
+        >>> result = validate_rids(ml, dataset_rids=["INVALID"])
+        >>> print(result)
+        ✗ Validation failed with 1 error(s)
+        <BLANKLINE>
+        Errors:
+          • Dataset RID 'INVALID' does not exist in catalog
+    """
+    is_valid: bool = True
+    errors: list[str] = field(default_factory=list)
+    warnings: list[str] = field(default_factory=list)
+    validated_rids: dict[str, dict[str, Any]] = field(default_factory=dict)
+    def add_error(self, message: str) -> None:
+        """Add an error message and mark result as invalid."""
+        self.errors.append(message)
+        self.is_valid = False
+    def add_warning(self, message: str) -> None:
+        """Add a warning message (does not affect ``is_valid``)."""
+        self.warnings.append(message)
+    def merge(self, other: "ValidationResult") -> "ValidationResult":
+        """Merge another validation result into this one, in place.
+        Args:
+            other: Result whose errors, warnings and validated entries are
+                appended to this one. Entries in ``other.validated_rids``
+                overwrite entries with the same key.
+        Returns:
+            This instance, so calls can be chained.
+        """
+        if other is self:
+            # Extending a list with itself would duplicate every message.
+            return self
+        if not other.is_valid:
+            self.is_valid = False
+        self.errors.extend(other.errors)
+        self.warnings.extend(other.warnings)
+        self.validated_rids.update(other.validated_rids)
+        return self
+    def __repr__(self) -> str:
+        """Return a formatted, multi-line summary of the validation result."""
+        if self.is_valid:
+            lines = ["✓ Validation passed"]
+            if self.validated_rids:
+                lines.append(f"  Validated {len(self.validated_rids)} RID(s)")
+        else:
+            lines = [f"✗ Validation failed with {len(self.errors)} error(s)"]
+        # Each non-empty section is preceded by a blank line and a heading.
+        if self.errors:
+            lines += ["", "Errors:"]
+            lines.extend(f"  • {error}" for error in self.errors)
+        if self.warnings:
+            lines += ["", "Warnings:"]
+            lines.extend(f"  ⚠ {warning}" for warning in self.warnings)
+        return "\n".join(lines)
+    def __str__(self) -> str:
+        """Return the same formatted summary as ``repr()`` for print()."""
+        return self.__repr__()
+def validate_rids(
+    ml: "DerivaML",
+    dataset_rids: list[str] | None = None,
+    asset_rids: list[str] | None = None,
+    dataset_versions: dict[str, str] | None = None,
+    workflow_rids: list[str] | None = None,
+    execution_rids: list[str] | None = None,
+    warn_missing_descriptions: bool = True,
+) -> ValidationResult:
+    """Validate that RIDs exist in the catalog.
+    Performs batch validation of RIDs to ensure they exist before running
+    experiments. This catches configuration errors early with clear messages.
+    If the batch lookup reports invalid RIDs, each RID is re-checked on its own
+    so that only the RIDs that are really missing are reported.
+    Args:
+        ml: Connected DerivaML instance.
+        dataset_rids: List of dataset RIDs to validate. Each must resolve to
+            the ``Dataset`` table.
+        asset_rids: List of asset RIDs to validate (existence only; no table
+            check is applied).
+        dataset_versions: Dictionary mapping dataset RID to required version string.
+            Only consulted when ``dataset_rids`` is also given. An entry whose
+            RID was not submitted for validation produces a warning.
+        workflow_rids: List of workflow RIDs to validate. Each must resolve to
+            the ``Workflow`` table.
+        execution_rids: List of execution RIDs to validate. Each must resolve
+            to the ``Execution`` table.
+        warn_missing_descriptions: If True (default), warn when a dataset has
+            no description.
+    Returns:
+        ValidationResult with is_valid flag, error/warning messages, and
+        resolved RID information (``rid``, ``table``, ``schema`` and, for
+        version-checked datasets, ``version`` / ``current_version``).
+    Example:
+        >>> result = validate_rids(
+        ...     ml,
+        ...     dataset_rids=["1-ABC", "2-DEF"],
+        ...     dataset_versions={"1-ABC": "0.4.0"},
+        ...     asset_rids=["3-GHI"],
+        ... )
+        >>> print(result)
+        ✓ Validation passed
+          Validated 3 RID(s)
+    """
+    from deriva_ml.core.exceptions import DerivaMLException
+    result = ValidationResult()
+    # (category label, requested RIDs, table those RIDs must live in or None)
+    requested = (
+        ("dataset", dataset_rids, "Dataset"),
+        ("asset", asset_rids, None),
+        ("workflow", workflow_rids, "Workflow"),
+        ("execution", execution_rids, "Execution"),
+    )
+    # Maps RID to category for error messages; a RID listed under several
+    # categories keeps the last one.
+    rid_categories: dict[str, str] = {}
+    for category, rids, _ in requested:
+        for rid in rids or ():
+            rid_categories[rid] = category
+    if not rid_categories:
+        return result  # Nothing to validate
+    all_rids: set[str] = set(rid_categories)
+    def record(resolved: Any) -> None:
+        """Store table/schema details for every RID in a resolve_rids() result."""
+        for rid, info in resolved.items():
+            result.validated_rids[rid] = {
+                "rid": rid,
+                "table": info.table_name,
+                "schema": info.schema_name,
+            }
+    # Batch resolve all RIDs
+    try:
+        record(ml.resolve_rids(all_rids))
+    except DerivaMLException as e:
+        if "Invalid RIDs:" in str(e):
+            # The batch call fails as a whole, so nothing has been recorded yet.
+            # Re-resolve one RID at a time (sorted for a stable message order) so
+            # that valid RIDs are still recorded and only bad ones are reported.
+            for rid in sorted(all_rids):
+                try:
+                    record(ml.resolve_rids({rid}))
+                except DerivaMLException as single_error:
+                    if "Invalid RIDs:" in str(single_error):
+                        category = rid_categories.get(rid, "unknown")
+                        result.add_error(f"{category.title()} RID '{rid}' does not exist in catalog")
+                    else:
+                        result.add_error(f"RID validation failed: {single_error}")
+        else:
+            result.add_error(f"RID validation failed: {e}")
+    # Successful dataset lookups, shared by the version and description checks
+    # so each dataset is fetched at most once.
+    datasets: dict[str, Any] = {}
+    def lookup(rid: str) -> Any:
+        """Return the dataset for ``rid``, fetching it only on first use."""
+        if rid not in datasets:
+            datasets[rid] = ml.lookup_dataset(rid)
+        return datasets[rid]
+    # Datasets whose version check already reported a failure
+    version_check_failed: set[str] = set()
+    # Validate dataset versions if specified
+    if dataset_versions and dataset_rids:
+        for rid, required_version in dataset_versions.items():
+            if rid not in all_rids:
+                # Never submitted for resolution, so nothing was reported for it.
+                result.add_warning(
+                    f"Version '{required_version}' was specified for dataset '{rid}', "
+                    f"but that RID was not submitted for validation; version not checked"
+                )
+                continue
+            if rid not in result.validated_rids:
+                continue  # Already reported as missing
+            try:
+                dataset = lookup(rid)
+                current_version = str(dataset.current_version) if dataset.current_version else None
+                if current_version is None:
+                    result.add_warning(
+                        f"Dataset '{rid}' has no version information. "
+                        f"Required version: {required_version}"
+                    )
+                elif current_version == required_version:
+                    result.validated_rids[rid]["version"] = required_version
+                else:
+                    # Check if the required version exists in history
+                    try:
+                        # Materialise once: the history is needed both for the
+                        # membership test and for the error message.
+                        available = [str(h.dataset_version) for h in dataset.list_versions()]
+                    except Exception:
+                        # Can't check history, just warn
+                        result.add_warning(
+                            f"Dataset '{rid}' current version ({current_version}) differs from "
+                            f"required version ({required_version}). Could not verify version history."
+                        )
+                    else:
+                        if required_version in available:
+                            # Version exists but is not current - this is OK
+                            result.validated_rids[rid]["version"] = required_version
+                            result.validated_rids[rid]["current_version"] = current_version
+                        else:
+                            result.add_error(
+                                f"Dataset '{rid}' does not have version '{required_version}'. "
+                                f"Current version: {current_version}. "
+                                f"Available versions: {available}"
+                            )
+            except Exception as e:
+                version_check_failed.add(rid)
+                result.add_error(f"Failed to validate dataset '{rid}' version: {e}")
+    # Validate that dataset / workflow / execution RIDs live in the expected table
+    for category, rids, expected_table in requested:
+        if expected_table is None:
+            continue
+        for rid in rids or ():
+            info = result.validated_rids.get(rid)
+            if info is not None and info.get("table") != expected_table:
+                result.add_error(
+                    f"RID '{rid}' specified as {category} but found in table "
+                    f"'{info.get('schema')}.{info.get('table')}'"
+                )
+    # Check for missing descriptions
+    if warn_missing_descriptions and dataset_rids:
+        for rid in dataset_rids:
+            info = result.validated_rids.get(rid)
+            if info is None or info.get("table") != "Dataset":
+                continue
+            try:
+                dataset = lookup(rid)
+                if not dataset.description or dataset.description.strip() == "":
+                    result.add_warning(f"Dataset '{rid}' has no description")
+            except Exception as e:
+                # Best-effort check: never fail validation over it, but do not
+                # hide the problem unless the version check already reported it.
+                if rid not in version_check_failed:
+                    result.add_warning(f"Could not check description of dataset '{rid}': {e}")
+    return result
+def validate_vocabulary_terms(
+    ml: "DerivaML",
+    vocabulary_name: str,
+    terms: list[str],
+) -> ValidationResult:
+    """Check that every requested term is present in a vocabulary.
+    Args:
+        ml: Connected DerivaML instance.
+        vocabulary_name: Name of the vocabulary table.
+        terms: List of term names to validate.
+    Returns:
+        ValidationResult holding one error per missing term, and one
+        ``"<vocabulary_name>:<term>"`` entry in ``validated_rids`` per term
+        that was found. Any exception raised while checking is reported as
+        an error on the result instead of propagating.
+    Example:
+        >>> result = validate_vocabulary_terms(ml, "Dataset_Type", ["Training", "Testing"])
+        >>> if not result.is_valid:
+        ...     for error in result.errors:
+        ...         print(f"  - {error}")
+    """
+    outcome = ValidationResult()
+    try:
+        known_names = {entry.name for entry in ml.list_terms(vocabulary_name)}
+        for candidate in terms:
+            if candidate in known_names:
+                key = f"{vocabulary_name}:{candidate}"
+                outcome.validated_rids[key] = {
+                    "vocabulary": vocabulary_name,
+                    "term": candidate,
+                }
+                continue
+            outcome.add_error(
+                f"Term '{candidate}' not found in vocabulary '{vocabulary_name}'. "
+                f"Available terms: {sorted(known_names)}"
+            )
+    except Exception as exc:
+        outcome.add_error(f"Failed to validate vocabulary '{vocabulary_name}': {exc}")
+    return outcome

deriva_ml/dataset/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from .aux_classes import DatasetSpec, DatasetSpecConfig, DatasetVersion, VersionPart
 from .dataset import Dataset
-from .dataset_bag import DatasetBag
+from .dataset_bag import DatasetBag, FeatureValueRecord
 __all__ = [
     "Dataset",
@@ -8,5 +8,6 @@ __all__ = [
     "DatasetSpecConfig",
     "DatasetBag",
     "DatasetVersion",
+    "FeatureValueRecord",
     "VersionPart",
 ]

deriva_ml/dataset/aux_classes.py CHANGED Viewed

@@ -3,6 +3,7 @@ THis module defines the DataSet class with is used to manipulate n
 """
 from enum import Enum
+from pprint import pformat
 from typing import Any, Optional, SupportsInt
 from hydra_zen import hydrated_dataclass
@@ -20,6 +21,16 @@ from semver import Version
 from deriva_ml.core.definitions import RID
+# Optional debugging aid: when IceCream is installed, configure ``ic`` to include
+# call context and to pretty-print its arguments, using ``model_dump()`` for
+# objects that provide it.
+try:
+    from icecream import ic
+    ic.configureOutput(
+        includeContext=True,
+        argToStringFunction=lambda x: pformat(x.model_dump() if hasattr(x, "model_dump") else x, width=80, depth=10),
+    )
+except ImportError:  # Graceful fallback if IceCream isn't installed.
+    # Silent stand-in: returns None when called without arguments, the argument
+    # itself for a single argument, and the tuple of arguments otherwise.
+    ic = lambda *a: None if not a else (a[0] if len(a) == 1 else a)  # noqa
 class VersionPart(Enum):
     """Simple enumeration for semantic versioning.
@@ -43,7 +54,7 @@ class DatasetVersion(Version):
         replace(major, minor, patch): Replace the major and minor versions
     """
-    def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0):
+    def __init__(self, major: SupportsInt, minor: SupportsInt = 0, patch: SupportsInt = 0) -> None:
         """Initialize a DatasetVersion object.
         Args:
@@ -72,7 +83,7 @@ class DatasetVersion(Version):
         return self.major, self.minor, self.patch
     @classmethod
-    def parse(cls, version: str, optional_minor_an_path=False) -> "DatasetVersion":
+    def parse(cls, version: str, optional_minor_an_path: bool = False) -> "DatasetVersion":
         v = Version.parse(version)
         return DatasetVersion(v.major, v.minor, v.patch)
@@ -111,8 +122,13 @@ class DatasetHistory(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
+    @field_validator("execution_rid", mode="before")
+    @classmethod
+    def _default_execution_rid(cls, v: str | None) -> str | None:
+        return None if v == "" else v
     @field_validator("description", mode="after")
-    def _default_description(cls, v) -> str:
+    def _default_description(cls, v: str | None) -> str:
         return v or ""
@@ -153,7 +169,7 @@ class DatasetMinid(BaseModel):
     @model_validator(mode="before")
     @classmethod
-    def insert_metadata(cls, data: Any) -> Any:
+    def insert_metadata(cls, data: dict) -> dict:
         if isinstance(data, dict):
             if "metadata" in data:
                 data = data | data["metadata"]

deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl

deriva-ml 1.17.10__py3-none-any.whl → 1.17.12__py3-none-any.whl