themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +12 -1
- themis/_version.py +2 -2
- themis/api.py +429 -0
- themis/backends/__init__.py +17 -0
- themis/backends/execution.py +197 -0
- themis/backends/storage.py +260 -0
- themis/cli/commands/results.py +252 -0
- themis/cli/main.py +427 -57
- themis/comparison/__init__.py +25 -0
- themis/comparison/engine.py +348 -0
- themis/comparison/reports.py +283 -0
- themis/comparison/statistics.py +402 -0
- themis/core/entities.py +23 -3
- themis/evaluation/metrics/code/__init__.py +19 -0
- themis/evaluation/metrics/code/codebleu.py +144 -0
- themis/evaluation/metrics/code/execution.py +280 -0
- themis/evaluation/metrics/code/pass_at_k.py +181 -0
- themis/evaluation/metrics/nlp/__init__.py +21 -0
- themis/evaluation/metrics/nlp/bertscore.py +138 -0
- themis/evaluation/metrics/nlp/bleu.py +129 -0
- themis/evaluation/metrics/nlp/meteor.py +153 -0
- themis/evaluation/metrics/nlp/rouge.py +136 -0
- themis/evaluation/pipelines/standard_pipeline.py +68 -8
- themis/experiment/cache_manager.py +8 -3
- themis/experiment/export.py +110 -2
- themis/experiment/orchestrator.py +109 -11
- themis/experiment/storage.py +1457 -110
- themis/generation/providers/litellm_provider.py +46 -0
- themis/generation/runner.py +22 -6
- themis/integrations/huggingface.py +12 -1
- themis/integrations/wandb.py +13 -1
- themis/interfaces/__init__.py +86 -0
- themis/presets/__init__.py +10 -0
- themis/presets/benchmarks.py +354 -0
- themis/presets/models.py +190 -0
- themis/server/__init__.py +28 -0
- themis/server/app.py +337 -0
- themis_eval-0.2.1.dist-info/METADATA +596 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
- themis_eval-0.1.1.dist-info/METADATA +0 -758
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
"""Storage backend interface for custom storage implementations.
|
|
2
|
+
|
|
3
|
+
This module defines the abstract interface for storage backends, allowing
|
|
4
|
+
users to implement custom storage solutions (cloud storage, databases, etc.)
|
|
5
|
+
without modifying Themis core code.
|
|
6
|
+
|
|
7
|
+
Example implementations:
|
|
8
|
+
- S3Backend: Store results in AWS S3
|
|
9
|
+
- GCSBackend: Store results in Google Cloud Storage
|
|
10
|
+
- PostgresBackend: Store results in PostgreSQL
|
|
11
|
+
- RedisBackend: Use Redis for distributed caching
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Dict, List
|
|
19
|
+
|
|
20
|
+
from themis.core.entities import (
|
|
21
|
+
EvaluationRecord,
|
|
22
|
+
ExperimentReport,
|
|
23
|
+
GenerationRecord,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class StorageBackend(ABC):
    """Abstract contract for pluggable experiment-result storage.

    Subclass this to persist Themis runs anywhere — object stores,
    relational databases, caches, and so on — without modifying core code.
    When used with concurrent workers, implementations should make every
    method thread-safe.

    Example:
        >>> class S3StorageBackend(StorageBackend):
        ...     def __init__(self, bucket: str):
        ...         self.bucket = bucket
        ...         self.s3_client = boto3.client('s3')
        ...
        ...     def save_run_metadata(self, run_id: str, metadata: RunMetadata) -> None:
        ...         key = f"runs/{run_id}/metadata.json"
        ...         self.s3_client.put_object(
        ...             Bucket=self.bucket,
        ...             Key=key,
        ...             Body=metadata.to_json(),
        ...         )
        ...     # ... implement other methods
    """

    @abstractmethod
    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Persist the metadata dictionary for run *run_id*."""
        ...

    @abstractmethod
    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Return the metadata dictionary for run *run_id*.

        Raises:
            FileNotFoundError: If no metadata exists for the run.
        """
        ...

    @abstractmethod
    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Append one generation record to run *run_id*.

        Implementations should make this atomic and thread-safe.
        """
        ...

    @abstractmethod
    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Return every generation record stored for run *run_id*."""
        ...

    @abstractmethod
    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Append one evaluation record to run *run_id*.

        Implementations should make this atomic and thread-safe.
        """
        ...

    @abstractmethod
    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Return run *run_id*'s evaluation records keyed by cache_key."""
        ...

    @abstractmethod
    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Persist the experiment report for run *run_id*."""
        ...

    @abstractmethod
    def load_report(self, run_id: str) -> ExperimentReport:
        """Return the experiment report for run *run_id*.

        Raises:
            FileNotFoundError: If no report exists for the run.
        """
        ...

    @abstractmethod
    def list_runs(self) -> List[str]:
        """Return the IDs of every run present in storage."""
        ...

    @abstractmethod
    def run_exists(self, run_id: str) -> bool:
        """Return True if run *run_id* is present in storage."""
        ...

    @abstractmethod
    def delete_run(self, run_id: str) -> None:
        """Remove every piece of stored data for run *run_id*."""
        ...

    def close(self) -> None:
        """Release any resources held by the backend (optional cleanup hook)."""
        ...
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class LocalFileStorageBackend(StorageBackend):
    """File-system adapter bridging ``ExperimentStorage`` to :class:`StorageBackend`.

    Wraps the existing local-disk storage implementation so it satisfies the
    backend interface. This is a compatibility layer: new code should target
    the interface while the underlying file logic stays untouched.
    """

    def __init__(self, storage_path: str | Path):
        """Bind the adapter to the storage directory at *storage_path*."""
        # Local import: resolve the concrete implementation at construction time.
        from themis.experiment.storage import ExperimentStorage

        self._storage = ExperimentStorage(storage_path)

    def save_run_metadata(self, run_id: str, metadata: Dict[str, Any]) -> None:
        """Register the run, using metadata's ``experiment_id`` when present."""
        self._storage.start_run(
            run_id, experiment_id=metadata.get("experiment_id", "default")
        )

    def load_run_metadata(self, run_id: str) -> Dict[str, Any]:
        """Unsupported: the wrapped storage exposes no metadata read path."""
        raise NotImplementedError("Use ExperimentStorage directly for now")

    def save_generation_record(self, run_id: str, record: GenerationRecord) -> None:
        """Append one generation record to the run's record log."""
        self._storage.append_record(run_id, record)

    def load_generation_records(self, run_id: str) -> List[GenerationRecord]:
        """Return the cached generation records as a list."""
        return list(self._storage.load_cached_records(run_id).values())

    def save_evaluation_record(self, run_id: str, record: EvaluationRecord) -> None:
        """Append one evaluation record to the run's evaluation log."""
        self._storage.append_evaluation(run_id, record)

    def load_evaluation_records(self, run_id: str) -> Dict[str, EvaluationRecord]:
        """Return cached evaluations keyed by cache_key."""
        return self._storage.load_cached_evaluations(run_id)

    def save_report(self, run_id: str, report: ExperimentReport) -> None:
        """Write the experiment report for the run."""
        self._storage.save_report(run_id, report)

    def load_report(self, run_id: str) -> ExperimentReport:
        """Read back the experiment report for the run."""
        return self._storage.load_report(run_id)

    def list_runs(self) -> List[str]:
        """Return all run IDs known to the wrapped storage."""
        return self._storage.list_runs()

    def run_exists(self, run_id: str) -> bool:
        """Return True when *run_id* appears among the stored runs."""
        known_runs = self._storage.list_runs()
        return run_id in known_runs

    def delete_run(self, run_id: str) -> None:
        """Unsupported: the wrapped storage cannot delete runs."""
        raise NotImplementedError("Delete not implemented in current storage")
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
# Explicit public API: the abstract interface and its local-file adapter.
__all__ = [
    "StorageBackend",
    "LocalFileStorageBackend",
]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
"""Quick results viewing commands for experiment summaries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Annotated
|
|
8
|
+
|
|
9
|
+
from cyclopts import Parameter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def summary_command(
    *,
    run_id: Annotated[
        str,
        Parameter(
            help="Run ID to view summary for",
        ),
    ],
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
) -> int:
    """Show a quick summary of a single experiment run.

    Reads the lightweight summary.json (~1KB) instead of the full
    report.json (~1.6MB), which keeps quick checks fast.

    Examples:
        # View summary for a specific run
        uv run python -m themis.cli results summary \\
            --run-id run-20260118-032014 \\
            --storage outputs/evaluation

        # Quick check of latest run
        uv run python -m themis.cli results summary \\
            --run-id $(ls -t outputs/evaluation | head -1)
    """
    divider = "=" * 80
    try:
        run_dir = storage / run_id
        summary_path = run_dir / "summary.json"

        # Guard clause: without summary.json there is nothing fast to show.
        if not summary_path.exists():
            print(f"Error: Summary file not found at {summary_path}")
            print("\nNote: summary.json is only available for runs created with")
            print("the updated export functionality. For older runs, use the")
            print("'compare' command which reads full report.json files.")
            return 1

        data = json.loads(summary_path.read_text(encoding="utf-8"))

        print(divider)
        print(f"Experiment Summary: {run_id}")
        print(divider)

        # Headline figures.
        print(f"\nRun ID: {data.get('run_id', 'N/A')}")
        print(f"Total Samples: {data.get('total_samples', 0)}")

        # Model / prompt / sampling configuration, when recorded.
        meta = data.get("metadata", {})
        if meta:
            print("\nConfiguration:")
            print(f" Model: {meta.get('model', 'N/A')}")
            print(f" Prompt: {meta.get('prompt_template', 'N/A')}")
            sampling_cfg = meta.get("sampling", {})
            if sampling_cfg:
                print(f" Temperature: {sampling_cfg.get('temperature', 'N/A')}")
                print(f" Max Tokens: {sampling_cfg.get('max_tokens', 'N/A')}")

        # Per-metric means with sample counts.
        metric_map = data.get("metrics", {})
        if metric_map:
            print("\nMetrics:")
            for metric_name, stats in metric_map.items():
                print(
                    f" {metric_name}: {stats.get('mean', 0):.4f} (n={stats.get('count', 0)})"
                )

        total_cost = data.get("cost_usd")
        if total_cost is not None:
            print(f"\nCost: ${total_cost:.4f}")

        # Only surface failures when there were any.
        n_failures = data.get("failures", 0)
        if n_failures > 0:
            print(f"\nFailures: {n_failures} ({data.get('failure_rate', 0):.2%})")

        print("\n" + divider)
        return 0

    except FileNotFoundError:
        print(f"Error: Run directory not found: {run_dir}")
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in summary file: {e}")
        return 1
    except Exception as e:
        # CLI boundary: report anything unexpected rather than crash.
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def list_command(
    *,
    storage: Annotated[
        Path,
        Parameter(
            help="Storage directory containing experiment results",
        ),
    ] = Path(".cache/runs"),
    limit: Annotated[
        int | None,
        Parameter(
            help="Maximum number of runs to display",
        ),
    ] = None,
    sort_by: Annotated[
        str,
        Parameter(
            help="Sort runs by: time (newest first) or metric name",
        ),
    ] = "time",
) -> int:
    """List all experiment runs with quick summaries.

    This command scans for summary.json files and displays a table of all runs.
    Much faster than loading full report.json files.

    Examples:
        # List all runs
        uv run python -m themis.cli results list

        # List 10 most recent runs
        uv run python -m themis.cli results list --limit 10

        # List runs sorted by accuracy
        uv run python -m themis.cli results list --sort-by accuracy

    Returns:
        0 on success (including "no runs found"), 1 on error.
    """
    try:
        if not storage.exists():
            print(f"Error: Storage directory not found: {storage}")
            return 1

        # Collect every readable summary.json (best effort: skip bad files).
        summaries = []
        for run_dir in storage.iterdir():
            if not run_dir.is_dir():
                continue
            summary_path = run_dir / "summary.json"
            if summary_path.exists():
                try:
                    with summary_path.open("r", encoding="utf-8") as f:
                        summary = json.load(f)
                    # Stash display/sort keys under underscore names so they
                    # cannot collide with real summary fields.
                    summary["_run_dir"] = run_dir.name
                    summary["_mtime"] = summary_path.stat().st_mtime
                    summaries.append(summary)
                except Exception:
                    # Malformed or unreadable summary: skip it rather than
                    # failing the whole listing.
                    continue

        if not summaries:
            print(f"No experiment runs found in {storage}")
            print("\nNote: Only runs with summary.json files are shown.")
            return 0

        # Sort newest-first by default; otherwise treat sort_by as a metric
        # name (runs missing that metric sort as 0).
        if sort_by == "time":
            summaries.sort(key=lambda s: s.get("_mtime", 0), reverse=True)
        else:
            summaries.sort(
                key=lambda s: s.get("metrics", {}).get(sort_by, {}).get("mean", 0),
                reverse=True,
            )

        # Fix: compare against None so an explicit --limit 0 shows zero rows
        # instead of being silently ignored (0 is falsy).
        if limit is not None:
            summaries = summaries[:limit]

        print("=" * 120)
        print(f"Found {len(summaries)} experiment run(s)")
        print("=" * 120)

        # Union of metric names across all runs forms the dynamic columns.
        all_metrics = set()
        for s in summaries:
            all_metrics.update(s.get("metrics", {}).keys())
        metric_names = sorted(all_metrics)

        header_cols = ["Run ID", "Model", "Samples"] + metric_names + ["Cost ($)"]
        col_widths = [25, 30, 8] + [12] * len(metric_names) + [10]

        # Pad then truncate each cell so columns stay aligned.
        header = " | ".join(
            col.ljust(width)[:width] for col, width in zip(header_cols, col_widths)
        )
        print(header)
        print("-" * len(header))

        for summary in summaries:
            # str() guards against non-string values (e.g. a null model field).
            run_id = str(summary.get("_run_dir", "N/A"))[:25]
            model = str(summary.get("metadata", {}).get("model", "N/A"))[:30]
            samples = str(summary.get("total_samples", 0))
            cost = summary.get("cost_usd")

            row_values = [run_id, model, samples]

            for metric_name in metric_names:
                mean = summary.get("metrics", {}).get(metric_name, {}).get("mean")
                row_values.append(f"{mean:.4f}" if mean is not None else "N/A")

            row_values.append(f"{cost:.4f}" if cost is not None else "N/A")

            row = " | ".join(
                val.ljust(width)[:width] for val, width in zip(row_values, col_widths)
            )
            print(row)

        print("=" * 120)
        return 0

    except Exception as e:
        # CLI boundary: report anything unexpected rather than crash.
        print(f"Unexpected error: {e}")
        import traceback

        traceback.print_exc()
        return 1
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# Explicit public API: the two CLI command entry points.
__all__ = ["summary_command", "list_command"]
|