themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/comparison/engine.py (new file)
@@ -0,0 +1,348 @@
+ """Comparison engine for analyzing multiple experiment runs.
+
+ This module provides the main ComparisonEngine class that orchestrates
+ loading runs, computing statistics, and generating comparison reports.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Sequence
+
+ from themis.comparison import reports, statistics
+ from themis.comparison.statistics import StatisticalTest
+ from themis.experiment import storage as experiment_storage
+
+
+ class ComparisonEngine:
+     """Engine for comparing multiple experiment runs.
+
+     This class loads experiment results from storage and performs
+     pairwise comparisons across all metrics with statistical testing.
+     """
+
+     def __init__(
+         self,
+         *,
+         storage: experiment_storage.ExperimentStorage | None = None,
+         storage_path: str | Path | None = None,
+         statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
+         alpha: float = 0.05,
+         n_bootstrap: int = 10000,
+         n_permutations: int = 10000,
+     ):
+         """Initialize comparison engine.
+
+         Args:
+             storage: Experiment storage instance
+             storage_path: Path to storage (if storage not provided)
+             statistical_test: Type of statistical test to use
+             alpha: Significance level for tests
+             n_bootstrap: Number of bootstrap iterations
+             n_permutations: Number of permutations for permutation test
+         """
+         if storage is None and storage_path is None:
+             raise ValueError("Either storage or storage_path must be provided")
+
+         self._storage = storage or experiment_storage.ExperimentStorage(storage_path)
+         self._statistical_test = statistical_test
+         self._alpha = alpha
+         self._n_bootstrap = n_bootstrap
+         self._n_permutations = n_permutations
+
+     def compare_runs(
+         self,
+         run_ids: Sequence[str],
+         *,
+         metrics: Sequence[str] | None = None,
+         statistical_test: StatisticalTest | None = None,
+     ) -> reports.ComparisonReport:
+         """Compare multiple runs across specified metrics.
+
+         Args:
+             run_ids: List of run IDs to compare
+             metrics: List of metrics to compare (None = all available)
+             statistical_test: Override default statistical test
+
+         Returns:
+             ComparisonReport with all comparisons and statistics
+
+         Raises:
+             ValueError: If fewer than 2 runs provided or runs not found
+         """
+         if len(run_ids) < 2:
+             raise ValueError("Need at least 2 runs to compare")
+
+         # Load all runs
+         run_data = {}
+         for run_id in run_ids:
+             try:
+                 data = self._load_run_metrics(run_id)
+                 run_data[run_id] = data
+             except FileNotFoundError:
+                 raise ValueError(f"Run not found: {run_id}")
+
+         # Determine metrics to compare
+         if metrics is None:
+             # Use all metrics that appear in all runs
+             all_metrics = set(run_data[run_ids[0]].keys())
+             for run_id in run_ids[1:]:
+                 all_metrics &= set(run_data[run_id].keys())
+             metrics = sorted(all_metrics)
+
+         if not metrics:
+             raise ValueError("No common metrics found across all runs")
+
+         # Perform pairwise comparisons
+         pairwise_results = []
+         for metric in metrics:
+             for i, run_a in enumerate(run_ids):
+                 for run_b in run_ids[i + 1:]:
+                     result = self._compare_pair(
+                         run_a,
+                         run_b,
+                         metric,
+                         run_data[run_a][metric],
+                         run_data[run_b][metric],
+                         statistical_test or self._statistical_test,
+                     )
+                     pairwise_results.append(result)
+
+         # Build win/loss matrices
+         win_loss_matrices = {}
+         for metric in metrics:
+             matrix = self._build_win_loss_matrix(run_ids, metric, pairwise_results)
+             win_loss_matrices[metric] = matrix
+
+         # Determine best run per metric
+         best_run_per_metric = {}
+         for metric in metrics:
+             # Find run with highest mean
+             best_run = max(
+                 run_ids,
+                 key=lambda rid: sum(run_data[rid][metric]) / len(run_data[rid][metric])
+             )
+             best_run_per_metric[metric] = best_run
+
+         # Determine overall best run (most wins across all metrics)
+         overall_wins = {run_id: 0 for run_id in run_ids}
+         for matrix in win_loss_matrices.values():
+             for run_id in run_ids:
+                 overall_wins[run_id] += matrix.win_counts.get(run_id, 0)
+
+         overall_best_run = max(overall_wins, key=overall_wins.get)
+
+         return reports.ComparisonReport(
+             run_ids=list(run_ids),
+             metrics=list(metrics),
+             pairwise_results=pairwise_results,
+             win_loss_matrices=win_loss_matrices,
+             best_run_per_metric=best_run_per_metric,
+             overall_best_run=overall_best_run,
+             metadata={
+                 "statistical_test": self._statistical_test.value,
+                 "alpha": self._alpha,
+                 "n_runs": len(run_ids),
+                 "n_metrics": len(metrics),
+             },
+         )
+
+     def _load_run_metrics(self, run_id: str) -> dict[str, list[float]]:
+         """Load all metric scores for a run.
+
+         Returns:
+             Dictionary mapping metric names to lists of scores
+         """
+         # Load evaluation records from storage (returns dict of cache_key -> EvaluationRecord)
+         eval_dict = self._storage.load_cached_evaluations(run_id)
+
+         # Organize scores by metric
+         metric_scores: dict[str, list[float]] = {}
+
+         # eval_dict is a dict, so iterate over values
+         for record in eval_dict.values():
+             for metric_name, score_obj in record.scores.items():
+                 if metric_name not in metric_scores:
+                     metric_scores[metric_name] = []
+
+                 # Get numeric score
+                 if hasattr(score_obj, 'value'):
+                     score = score_obj.value
+                 elif isinstance(score_obj, (int, float)):
+                     score = float(score_obj)
+                 else:
+                     continue  # Skip non-numeric scores
+
+                 metric_scores[metric_name].append(score)
+
+         return metric_scores
+
+     def _compare_pair(
+         self,
+         run_a_id: str,
+         run_b_id: str,
+         metric_name: str,
+         samples_a: list[float],
+         samples_b: list[float],
+         test_type: StatisticalTest,
+     ) -> reports.ComparisonResult:
+         """Compare two runs on a single metric.
+
+         Args:
+             run_a_id: First run identifier
+             run_b_id: Second run identifier
+             metric_name: Name of metric being compared
+             samples_a: Scores for first run
+             samples_b: Scores for second run
+             test_type: Type of statistical test to perform
+
+         Returns:
+             ComparisonResult with comparison statistics
+         """
+         # Calculate means
+         mean_a = sum(samples_a) / len(samples_a)
+         mean_b = sum(samples_b) / len(samples_b)
+
+         # Calculate delta
+         delta = mean_a - mean_b
+         delta_percent = (delta / mean_b * 100) if mean_b != 0 else 0.0
+
+         # Perform statistical test
+         test_result = None
+         if test_type == StatisticalTest.T_TEST:
+             test_result = statistics.t_test(
+                 samples_a, samples_b, alpha=self._alpha, paired=True
+             )
+         elif test_type == StatisticalTest.BOOTSTRAP:
+             test_result = statistics.bootstrap_confidence_interval(
+                 samples_a,
+                 samples_b,
+                 n_bootstrap=self._n_bootstrap,
+                 confidence_level=1 - self._alpha,
+             )
+         elif test_type == StatisticalTest.PERMUTATION:
+             test_result = statistics.permutation_test(
+                 samples_a,
+                 samples_b,
+                 n_permutations=self._n_permutations,
+                 alpha=self._alpha,
+             )
+
+         # Determine winner
+         if test_result and test_result.significant:
+             winner = run_a_id if delta > 0 else run_b_id
+         else:
+             winner = "tie"
+
+         return reports.ComparisonResult(
+             metric_name=metric_name,
+             run_a_id=run_a_id,
+             run_b_id=run_b_id,
+             run_a_mean=mean_a,
+             run_b_mean=mean_b,
+             delta=delta,
+             delta_percent=delta_percent,
+             winner=winner,
+             test_result=test_result,
+             run_a_samples=samples_a,
+             run_b_samples=samples_b,
+         )
+
+     def _build_win_loss_matrix(
+         self,
+         run_ids: Sequence[str],
+         metric: str,
+         pairwise_results: list[reports.ComparisonResult],
+     ) -> reports.WinLossMatrix:
+         """Build win/loss matrix for a specific metric.
+
+         Args:
+             run_ids: List of run IDs
+             metric: Metric name
+             pairwise_results: All pairwise comparison results
+
+         Returns:
+             WinLossMatrix for the metric
+         """
+         n = len(run_ids)
+         matrix = [["—" for _ in range(n)] for _ in range(n)]
+
+         win_counts = {rid: 0 for rid in run_ids}
+         loss_counts = {rid: 0 for rid in run_ids}
+         tie_counts = {rid: 0 for rid in run_ids}
+
+         # Fill matrix from pairwise results
+         for result in pairwise_results:
+             if result.metric_name != metric:
+                 continue
+
+             idx_a = run_ids.index(result.run_a_id)
+             idx_b = run_ids.index(result.run_b_id)
+
+             if result.winner == result.run_a_id:
+                 matrix[idx_a][idx_b] = "win"
+                 matrix[idx_b][idx_a] = "loss"
+                 win_counts[result.run_a_id] += 1
+                 loss_counts[result.run_b_id] += 1
+             elif result.winner == result.run_b_id:
+                 matrix[idx_a][idx_b] = "loss"
+                 matrix[idx_b][idx_a] = "win"
+                 loss_counts[result.run_a_id] += 1
+                 win_counts[result.run_b_id] += 1
+             else:  # tie
+                 matrix[idx_a][idx_b] = "tie"
+                 matrix[idx_b][idx_a] = "tie"
+                 tie_counts[result.run_a_id] += 1
+                 tie_counts[result.run_b_id] += 1
+
+         return reports.WinLossMatrix(
+             run_ids=list(run_ids),
+             metric_name=metric,
+             matrix=matrix,
+             win_counts=win_counts,
+             loss_counts=loss_counts,
+             tie_counts=tie_counts,
+         )
+
+
+ def compare_runs(
+     run_ids: Sequence[str],
+     *,
+     storage_path: str | Path,
+     metrics: Sequence[str] | None = None,
+     statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
+     alpha: float = 0.05,
+ ) -> reports.ComparisonReport:
+     """Convenience function to compare runs.
+
+     Args:
+         run_ids: List of run IDs to compare
+         storage_path: Path to experiment storage
+         metrics: List of metrics to compare (None = all)
+         statistical_test: Type of statistical test
+         alpha: Significance level
+
+     Returns:
+         ComparisonReport with all comparisons
+
+     Example:
+         >>> report = compare_runs(
+         ...     ["run-gpt4", "run-claude"],
+         ...     storage_path=".cache/experiments",
+         ...     metrics=["ExactMatch", "BLEU"],
+         ... )
+         >>> print(report.summary())
+     """
+     engine = ComparisonEngine(
+         storage_path=storage_path,
+         statistical_test=statistical_test,
+         alpha=alpha,
+     )
+
+     return engine.compare_runs(run_ids, metrics=metrics)
+
+
+ __all__ = [
+     "ComparisonEngine",
+     "compare_runs",
+ ]
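
The new comparison API can be driven either through the ComparisonEngine class or the module-level compare_runs() wrapper defined above. The sketch below is a minimal usage example based only on the code in this diff; the run IDs and storage path are illustrative placeholders and assume results were previously written by themis.experiment.storage.ExperimentStorage.

    from themis.comparison.engine import ComparisonEngine, compare_runs
    from themis.comparison.statistics import StatisticalTest

    # One-shot comparison with the defaults (bootstrap test, alpha=0.05).
    report = compare_runs(
        ["run-gpt4", "run-claude"],          # run IDs are illustrative
        storage_path=".cache/experiments",   # path is illustrative
        metrics=["ExactMatch", "BLEU"],
    )
    print(report.summary(include_details=True))

    # Constructing the engine directly exposes the statistical test parameters.
    engine = ComparisonEngine(
        storage_path=".cache/experiments",
        statistical_test=StatisticalTest.PERMUTATION,
        alpha=0.01,
        n_permutations=5000,
    )
    report = engine.compare_runs(["run-gpt4", "run-claude", "run-llama"])
    print(report.overall_best_run)

Per the engine code, omitting metrics compares only the metrics common to every run, and a pair is declared a "win" only when the configured test reports a significant difference; otherwise it is recorded as a tie.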
themis/comparison/reports.py (new file)
@@ -0,0 +1,283 @@
+ """Comparison reports for analyzing experiment results.
+
+ This module provides structured reports for comparing multiple runs,
+ including win/loss matrices, metric deltas, and statistical significance.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Sequence
+
+ from themis.comparison.statistics import StatisticalTestResult
+
+
+ @dataclass
+ class ComparisonResult:
+     """Result of comparing two runs on a single metric.
+
+     Attributes:
+         metric_name: Name of the metric being compared
+         run_a_id: Identifier for first run
+         run_b_id: Identifier for second run
+         run_a_mean: Mean value for first run
+         run_b_mean: Mean value for second run
+         delta: Difference (run_a - run_b)
+         delta_percent: Percentage difference
+         winner: ID of the winning run ("tie" if no significant difference)
+         test_result: Statistical test result (if performed)
+         run_a_samples: Individual sample scores for run A
+         run_b_samples: Individual sample scores for run B
+     """
+
+     metric_name: str
+     run_a_id: str
+     run_b_id: str
+     run_a_mean: float
+     run_b_mean: float
+     delta: float
+     delta_percent: float
+     winner: str  # run_a_id, run_b_id, or "tie"
+     test_result: StatisticalTestResult | None = None
+     run_a_samples: list[float] = field(default_factory=list)
+     run_b_samples: list[float] = field(default_factory=list)
+
+     def is_significant(self) -> bool:
+         """Check if the difference is statistically significant."""
+         return self.test_result is not None and self.test_result.significant
+
+     def summary(self) -> str:
+         """Generate a human-readable summary."""
+         direction = "↑" if self.delta > 0 else "↓" if self.delta < 0 else "="
+
+         summary = (
+             f"{self.metric_name}: {self.run_a_id} "
+             f"({self.run_a_mean:.3f}) vs {self.run_b_id} "
+             f"({self.run_b_mean:.3f}) = {direction}{abs(self.delta):.3f} "
+             f"({self.delta_percent:+.1f}%)"
+         )
+
+         if self.test_result:
+             sig_marker = "***" if self.is_significant() else "n.s."
+             summary += f" [{sig_marker}, p={self.test_result.p_value:.4f}]"
+
+         return summary
+
+
+ @dataclass
+ class WinLossMatrix:
+     """Win/loss/tie matrix for comparing multiple runs.
+
+     Attributes:
+         run_ids: List of run IDs in the matrix
+         metric_name: Name of the metric being compared
+         matrix: 2D matrix of results
+             matrix[i][j] = result of comparing run i vs run j
+             Values: "win", "loss", "tie"
+         win_counts: Number of wins for each run
+         loss_counts: Number of losses for each run
+         tie_counts: Number of ties for each run
+     """
+
+     run_ids: list[str]
+     metric_name: str
+     matrix: list[list[str]]
+     win_counts: dict[str, int] = field(default_factory=dict)
+     loss_counts: dict[str, int] = field(default_factory=dict)
+     tie_counts: dict[str, int] = field(default_factory=dict)
+
+     def get_result(self, run_a: str, run_b: str) -> str:
+         """Get comparison result between two runs."""
+         try:
+             idx_a = self.run_ids.index(run_a)
+             idx_b = self.run_ids.index(run_b)
+             return self.matrix[idx_a][idx_b]
+         except (ValueError, IndexError):
+             return "unknown"
+
+     def rank_runs(self) -> list[tuple[str, int, int, int]]:
+         """Rank runs by wins (descending), then losses (ascending).
+
+         Returns:
+             List of (run_id, wins, losses, ties) sorted by performance
+         """
+         rankings = [
+             (
+                 run_id,
+                 self.win_counts.get(run_id, 0),
+                 self.loss_counts.get(run_id, 0),
+                 self.tie_counts.get(run_id, 0),
+             )
+             for run_id in self.run_ids
+         ]
+
+         # Sort by wins (desc), then losses (asc)
+         rankings.sort(key=lambda x: (-x[1], x[2]))
+         return rankings
+
+     def to_table(self) -> str:
+         """Generate a formatted table representation."""
+         lines = []
+
+         # Header
+         header = f"{'Run':<20} | " + " | ".join(f"{rid:<12}" for rid in self.run_ids)
+         lines.append(header)
+         lines.append("-" * len(header))
+
+         # Rows
+         for i, run_id in enumerate(self.run_ids):
+             row = f"{run_id:<20} | "
+             row += " | ".join(f"{self.matrix[i][j]:<12}" for j in range(len(self.run_ids)))
+             lines.append(row)
+
+         # Summary
+         lines.append("")
+         lines.append("Summary (W/L/T):")
+         for run_id, wins, losses, ties in self.rank_runs():
+             lines.append(f"  {run_id}: {wins}/{losses}/{ties}")
+
+         return "\n".join(lines)
+
+
+ @dataclass
+ class ComparisonReport:
+     """Comprehensive comparison report for multiple runs.
+
+     Attributes:
+         run_ids: List of all run IDs being compared
+         metrics: List of metric names being compared
+         pairwise_results: List of all pairwise comparison results
+         win_loss_matrices: Win/loss matrices for each metric
+         best_run_per_metric: Best run for each metric
+         overall_best_run: Overall best run across all metrics
+         metadata: Additional metadata about the comparison
+     """
+
+     run_ids: list[str]
+     metrics: list[str]
+     pairwise_results: list[ComparisonResult] = field(default_factory=list)
+     win_loss_matrices: dict[str, WinLossMatrix] = field(default_factory=dict)
+     best_run_per_metric: dict[str, str] = field(default_factory=dict)
+     overall_best_run: str | None = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def get_comparison(
+         self, run_a: str, run_b: str, metric: str
+     ) -> ComparisonResult | None:
+         """Get comparison result for specific runs and metric."""
+         for result in self.pairwise_results:
+             if (
+                 result.metric_name == metric
+                 and result.run_a_id == run_a
+                 and result.run_b_id == run_b
+             ):
+                 return result
+         return None
+
+     def get_metric_results(self, metric: str) -> list[ComparisonResult]:
+         """Get all comparison results for a specific metric."""
+         return [r for r in self.pairwise_results if r.metric_name == metric]
+
+     def summary(self, include_details: bool = False) -> str:
+         """Generate a human-readable summary of the comparison.
+
+         Args:
+             include_details: Whether to include detailed pairwise comparisons
+
+         Returns:
+             Formatted summary string
+         """
+         lines = []
+         lines.append("=" * 80)
+         lines.append("COMPARISON REPORT")
+         lines.append("=" * 80)
+         lines.append("")
+
+         # Overall summary
+         lines.append(f"Comparing {len(self.run_ids)} runs across {len(self.metrics)} metrics")
+         lines.append(f"Runs: {', '.join(self.run_ids)}")
+         lines.append(f"Metrics: {', '.join(self.metrics)}")
+         lines.append("")
+
+         # Best run per metric
+         if self.best_run_per_metric:
+             lines.append("Best Run Per Metric:")
+             for metric, run_id in self.best_run_per_metric.items():
+                 lines.append(f"  {metric}: {run_id}")
+             lines.append("")
+
+         # Overall best
+         if self.overall_best_run:
+             lines.append(f"Overall Best Run: {self.overall_best_run}")
+             lines.append("")
+
+         # Win/loss matrices
+         if self.win_loss_matrices and include_details:
+             lines.append("=" * 80)
+             lines.append("WIN/LOSS MATRICES")
+             lines.append("=" * 80)
+             for metric, matrix in self.win_loss_matrices.items():
+                 lines.append("")
+                 lines.append(f"Metric: {metric}")
+                 lines.append("-" * 40)
+                 lines.append(matrix.to_table())
+                 lines.append("")
+
+         # Pairwise comparisons
+         if include_details and self.pairwise_results:
+             lines.append("=" * 80)
+             lines.append("PAIRWISE COMPARISONS")
+             lines.append("=" * 80)
+
+             for metric in self.metrics:
+                 results = self.get_metric_results(metric)
+                 if results:
+                     lines.append("")
+                     lines.append(f"Metric: {metric}")
+                     lines.append("-" * 40)
+                     for result in results:
+                         lines.append(f"  {result.summary()}")
+                     lines.append("")
+
+         return "\n".join(lines)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert report to dictionary for serialization."""
+         return {
+             "run_ids": self.run_ids,
+             "metrics": self.metrics,
+             "best_run_per_metric": self.best_run_per_metric,
+             "overall_best_run": self.overall_best_run,
+             "pairwise_results": [
+                 {
+                     "metric": r.metric_name,
+                     "run_a": r.run_a_id,
+                     "run_b": r.run_b_id,
+                     "run_a_mean": r.run_a_mean,
+                     "run_b_mean": r.run_b_mean,
+                     "delta": r.delta,
+                     "delta_percent": r.delta_percent,
+                     "winner": r.winner,
+                     "significant": r.is_significant(),
+                     "p_value": r.test_result.p_value if r.test_result else None,
+                 }
+                 for r in self.pairwise_results
+             ],
+             "win_loss_summary": {
+                 metric: {
+                     "rankings": [
+                         {"run_id": rid, "wins": w, "losses": l, "ties": t}
+                         for rid, w, l, t in matrix.rank_runs()
+                     ]
+                 }
+                 for metric, matrix in self.win_loss_matrices.items()
+             },
+             "metadata": self.metadata,
+         }
+
+
+ __all__ = [
+     "ComparisonResult",
+     "WinLossMatrix",
+     "ComparisonReport",
+ ]
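
A ComparisonReport is a plain dataclass, so downstream tooling can consume it directly. The sketch below, again using illustrative run IDs, metric names, and paths, exercises the drill-down and serialization helpers defined above.

    import json

    from themis.comparison.engine import compare_runs

    report = compare_runs(
        ["run-a", "run-b", "run-c"],         # illustrative run IDs
        storage_path=".cache/experiments",   # illustrative path
    )

    # Per-metric drill-down: each pairwise result carries means, delta, and significance.
    for result in report.get_metric_results("ExactMatch"):
        print(result.summary())

    # Win/loss ranking for a single metric.
    matrix = report.win_loss_matrices.get("ExactMatch")
    if matrix is not None:
        for run_id, wins, losses, ties in matrix.rank_runs():
            print(f"{run_id}: {wins}W / {losses}L / {ties}T")

    # Serialize the whole report, e.g. for dashboards or archival.
    with open("comparison_report.json", "w") as fh:
        json.dump(report.to_dict(), fh, indent=2)

Note that to_dict() keeps only aggregate values and p-values; the raw per-sample scores stay on the ComparisonResult objects (run_a_samples / run_b_samples) and are not serialized.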