isage_benchmark_agent-0.1.0.1-cp311-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/evaluation/__init__.py
@@ -0,0 +1,217 @@
```python
"""
Evaluation module for Agent Capability Benchmark.

This module provides metrics, analyzers, and report builders for evaluating
agent performance across three capabilities: tool selection, task planning,
and timing judgment.
"""

from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Sequence

from pydantic import BaseModel, ConfigDict, Field

__all__ = [
    "MetricOutput",
    "EvaluationReport",
    "Metric",
    "Analyzer",
    "ReportBuilder",
    "compute_metrics",
    "MetricRegistry",
]


class MetricOutput(BaseModel):
    """Output from a metric computation."""

    value: float
    details: dict[str, Any] = Field(default_factory=dict)


class EvaluationReport(BaseModel):
    """Complete evaluation report with metrics, breakdowns, and artifacts."""

    task: str
    experiment_id: str
    metrics: dict[str, float]
    breakdowns: dict[str, Any] = Field(default_factory=dict)
    artifacts: dict[str, Path] = Field(default_factory=dict)
    timestamp: str

    model_config = ConfigDict(arbitrary_types_allowed=True)


class Metric(Protocol):
    """Protocol for metric implementations."""

    name: str

    def compute(self, predictions: Sequence[Any], references: Sequence[Any]) -> MetricOutput:
        """
        Compute metric from predictions and references.

        Args:
            predictions: Model predictions
            references: Ground truth references

        Returns:
            MetricOutput with value and optional details
        """
        ...


class Analyzer(Protocol):
    """Protocol for analyzer implementations."""

    name: str

    def analyze(
        self, predictions: Sequence[Any], references: Sequence[Any], metadata: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Analyze predictions and produce breakdowns.

        Args:
            predictions: Model predictions
            references: Ground truth references
            metadata: Additional context from experiment

        Returns:
            Dictionary with analysis results
        """
        ...


class ReportBuilder(Protocol):
    """Protocol for report builder implementations."""

    def build(self, report: EvaluationReport, output_path: Path) -> Path:
        """
        Build and save report to file.

        Args:
            report: EvaluationReport to format
            output_path: Path to save report

        Returns:
            Path to saved report file
        """
        ...


# Import metric registry after defining base classes
from sage.benchmark.benchmark_agent.evaluation.metrics import MetricRegistry


def compute_metrics(
    task: str,
    predictions: list[dict[str, Any]],
    references: list[dict[str, Any]],
    metrics: list[str],
    k: int = 5,
) -> dict[str, float]:
    """
    Compute evaluation metrics for experiment results.

    Args:
        task: Task type ('tool_selection', 'planning', 'timing_detection')
        predictions: List of prediction dictionaries
        references: List of reference dictionaries
        metrics: List of metric names to compute
        k: Top-k parameter for ranking metrics

    Returns:
        Dictionary mapping metric names to values
    """
    results = {}

    if task == "tool_selection":
        # Extract tool lists from predictions and references
        pred_tools = []
        ref_tools = []

        for pred, ref in zip(predictions, references):
            # Get predicted tool IDs
            if "predicted_tools" in pred:
                tools = pred["predicted_tools"]
                if tools and isinstance(tools[0], dict):
                    pred_tools.append([t["tool_id"] for t in tools])
                else:
                    pred_tools.append(tools if tools else [])
            else:
                pred_tools.append([])

            # Get reference tool IDs
            if "ground_truth_tools" in ref:
                ref_tools.append(ref["ground_truth_tools"])
            elif "top_k" in ref:
                ref_tools.append(ref["top_k"])
            else:
                ref_tools.append([])

        # Compute each metric
        for metric_name in metrics:
            try:
                if metric_name in ("top_k_accuracy", "recall_at_k", "precision_at_k"):
                    metric = MetricRegistry.get(metric_name, k=k)
                elif metric_name == "mrr":
                    metric = MetricRegistry.get("mrr")
                else:
                    continue

                output = metric.compute(pred_tools, ref_tools)
                results[metric_name] = output.value
            except Exception as e:
                results[metric_name] = 0.0
                results[f"{metric_name}_error"] = str(e)

    elif task == "timing_detection":
        # Extract boolean decisions
        pred_decisions = []
        ref_decisions = []

        for pred, ref in zip(predictions, references):
            pred_decisions.append(pred.get("should_call_tool", False))
            ref_decisions.append(ref.get("should_call_tool", False))

        # Metric name mapping for timing detection
        timing_metric_map = {
            "accuracy": "timing_accuracy",
            "precision": "timing_precision",
            "recall": "timing_recall",
            "f1": "timing_f1",
        }

        for metric_name in metrics:
            try:
                # Map simple names to full metric names
                registry_name = timing_metric_map.get(metric_name, metric_name)
                metric = MetricRegistry.get(registry_name)
                output = metric.compute(pred_decisions, ref_decisions)
                results[metric_name] = output.value
                # Include details if available
                if hasattr(output, "details") and output.details:
                    results[f"{metric_name}_details"] = output.details
            except Exception as e:
                results[metric_name] = 0.0
                results[f"{metric_name}_error"] = str(e)

    elif task == "planning":
        # Extract tool sequences
        pred_sequences = []
        ref_sequences = []

        for pred, ref in zip(predictions, references):
            pred_sequences.append(pred.get("tool_sequence", []))
            ref_sequences.append(ref.get("tool_sequence", []))

        for metric_name in metrics:
            try:
                metric = MetricRegistry.get(metric_name)
                output = metric.compute(pred_sequences, ref_sequences)
                results[metric_name] = output.value
            except Exception:
                results[metric_name] = 0.0

    return results
```
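For orientation, here is a minimal usage sketch of `compute_metrics` as defined in the `__init__.py` above. The sample predictions, references, and tool IDs are invented for illustration; the metric names assume the registry exposes `recall_at_k` and `mrr`, as referenced in the code.

```python
# Illustrative sketch only -- the data shapes follow compute_metrics above,
# but the tool IDs and samples are hypothetical.
from sage.benchmark.benchmark_agent.evaluation import compute_metrics

predictions = [
    {"predicted_tools": [{"tool_id": "web_search_01"}, {"tool_id": "math_calc_02"}]},
    {"predicted_tools": ["db_query_03"]},
]
references = [
    {"ground_truth_tools": ["web_search_01"]},
    {"ground_truth_tools": ["db_query_03", "db_query_04"]},
]

scores = compute_metrics(
    task="tool_selection",
    predictions=predictions,
    references=references,
    metrics=["recall_at_k", "mrr"],
    k=5,
)
print(scores)  # e.g. {"recall_at_k": ..., "mrr": ...}
```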
sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py
@@ -0,0 +1,11 @@
```python
"""Analyzers package initialization."""

from .planning_analyzer import PlanningAnalyzer
from .timing_analyzer import TimingAnalyzer
from .tool_selection_analyzer import ToolSelectionAnalyzer

__all__ = [
    "ToolSelectionAnalyzer",
    "PlanningAnalyzer",
    "TimingAnalyzer",
]
```
sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py
@@ -0,0 +1,111 @@
```python
"""Planning analyzer for step-level alignment analysis."""

from collections import Counter, defaultdict
from typing import Any, Sequence


class PlanningAnalyzer:
    """
    Analyzer for task planning predictions.

    Provides breakdowns by:
    - Step-level correctness
    - Tool sequence alignment
    - Failure patterns
    """

    name = "planning"

    def analyze(
        self,
        predictions: Sequence[list[str]],
        references: Sequence[list[str]],
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Analyze planning predictions.

        Args:
            predictions: List of predicted tool sequences
            references: List of reference tool sequences
            metadata: Additional context

        Returns:
            Dictionary with analysis results
        """
        # Step-level analysis
        step_correctness: list[float] = []
        length_diffs: list[int] = []
        failure_modes: dict[str, int] = defaultdict(int)

        exact_matches = 0
        prefix_matches = 0

        for pred, ref in zip(predictions, references):
            # Length analysis
            length_diffs.append(len(pred) - len(ref))

            # Exact match
            if pred == ref:
                exact_matches += 1
                step_correctness.append(1.0)
                failure_modes["perfect"] += 1
                prefix_matches += 1
                continue

            # Prefix match
            min_len = min(len(pred), len(ref))
            if min_len > 0 and pred[:min_len] == ref[:min_len]:
                prefix_matches += 1

            # Step-by-step correctness
            correct_steps = sum(1 for p, r in zip(pred, ref) if p == r)
            step_acc = correct_steps / len(ref) if len(ref) > 0 else 0.0
            step_correctness.append(step_acc)

            # Classify failure mode
            if len(pred) == 0:
                failure_modes["empty_plan"] += 1
            elif len(pred) < len(ref):
                failure_modes["too_short"] += 1
            elif len(pred) > len(ref):
                failure_modes["too_long"] += 1
            elif set(pred) == set(ref):
                failure_modes["wrong_order"] += 1
            else:
                failure_modes["wrong_tools"] += 1

        # Tool sequence statistics
        pred_lengths = [len(p) for p in predictions]
        ref_lengths = [len(r) for r in references]

        # Tool usage analysis
        tool_usage_pred: Counter[str] = Counter()
        tool_usage_ref: Counter[str] = Counter()
        for pred in predictions:
            tool_usage_pred.update(pred)
        for ref in references:
            tool_usage_ref.update(ref)

        return {
            "exact_match_rate": exact_matches / len(predictions) if predictions else 0.0,
            "prefix_match_rate": prefix_matches / len(predictions) if predictions else 0.0,
            "step_correctness": {
                "mean": sum(step_correctness) / len(step_correctness) if step_correctness else 0.0,
                "min": min(step_correctness) if step_correctness else 0.0,
                "max": max(step_correctness) if step_correctness else 0.0,
                "distribution": step_correctness,
            },
            "length_analysis": {
                "pred_avg": sum(pred_lengths) / len(pred_lengths) if pred_lengths else 0.0,
                "ref_avg": sum(ref_lengths) / len(ref_lengths) if ref_lengths else 0.0,
                "length_diff_mean": sum(length_diffs) / len(length_diffs) if length_diffs else 0.0,
                "length_diff_distribution": length_diffs,
            },
            "failure_modes": dict(failure_modes),
            "tool_usage": {
                "predicted_most_common": tool_usage_pred.most_common(10),
                "reference_most_common": tool_usage_ref.most_common(10),
            },
            "total_samples": len(predictions),
        }
```
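A short, hedged example of how `PlanningAnalyzer.analyze` might be called, based on the signature above; the tool sequences are made up to show the expected input shape (one list of tool-ID strings per sample).

```python
# Illustrative sketch only -- hypothetical tool sequences.
from sage.benchmark.benchmark_agent.evaluation.analyzers import PlanningAnalyzer

analyzer = PlanningAnalyzer()
breakdown = analyzer.analyze(
    predictions=[["search", "summarize"], ["search"]],
    references=[["search", "summarize"], ["search", "rank", "summarize"]],
    metadata={},
)
print(breakdown["exact_match_rate"])  # 0.5 for this toy data
print(breakdown["failure_modes"])     # {"perfect": 1, "too_short": 1}
```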
sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py
@@ -0,0 +1,135 @@
```python
"""Timing analyzer for confusion matrix and threshold analysis."""

from typing import Any, Sequence

import numpy as np


class TimingAnalyzer:
    """
    Analyzer for timing judgment predictions.

    Provides breakdowns by:
    - Confusion matrix
    - Confidence distribution
    - Threshold sensitivity (if confidence scores available)
    """

    name = "timing"

    def analyze(
        self, predictions: Sequence[bool], references: Sequence[bool], metadata: dict[str, Any]
    ) -> dict[str, Any]:
        """
        Analyze timing predictions.

        Args:
            predictions: List of predicted decisions (True = call tool)
            references: List of reference decisions
            metadata: Additional context (may include confidence scores)

        Returns:
            Dictionary with analysis results
        """
        preds = np.array(predictions, dtype=bool)
        refs = np.array(references, dtype=bool)

        # Confusion matrix
        true_positives = int(np.sum(preds & refs))
        false_positives = int(np.sum(preds & ~refs))
        false_negatives = int(np.sum(~preds & refs))
        true_negatives = int(np.sum(~preds & ~refs))

        confusion_matrix = {
            "true_positives": true_positives,
            "false_positives": false_positives,
            "false_negatives": false_negatives,
            "true_negatives": true_negatives,
        }

        # Derived metrics
        total = len(predictions)
        positive_rate = (true_positives + false_positives) / total if total > 0 else 0.0
        true_positive_rate = (
            true_positives / (true_positives + false_negatives)
            if (true_positives + false_negatives) > 0
            else 0.0
        )
        false_positive_rate = (
            false_positives / (false_positives + true_negatives)
            if (false_positives + true_negatives) > 0
            else 0.0
        )

        # Class distribution
        class_distribution = {
            "reference_positive_ratio": float(np.mean(refs)),
            "predicted_positive_ratio": float(np.mean(preds)),
            "reference_positive_count": int(np.sum(refs)),
            "predicted_positive_count": int(np.sum(preds)),
        }

        # Confidence analysis if available
        confidence_analysis = {}
        if "confidences" in metadata:
            confidences = np.array(metadata["confidences"])

            # Confidence by correctness
            correct_mask = preds == refs
            confidence_analysis = {
                "mean_confidence_correct": (
                    float(np.mean(confidences[correct_mask])) if np.any(correct_mask) else 0.0
                ),
                "mean_confidence_incorrect": (
                    float(np.mean(confidences[~correct_mask])) if np.any(~correct_mask) else 0.0
                ),
                "mean_confidence_overall": float(np.mean(confidences)),
                "confidence_distribution": {
                    "bins": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
                    "counts": np.histogram(confidences, bins=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])[
                        0
                    ].tolist(),
                },
            }

            # Threshold sensitivity analysis
            thresholds = np.linspace(0.1, 0.9, 9)
            threshold_metrics = []

            for threshold in thresholds:
                thresh_preds = confidences >= threshold
                tp = int(np.sum(thresh_preds & refs))
                fp = int(np.sum(thresh_preds & ~refs))
                fn = int(np.sum(~thresh_preds & refs))
                # tn = int(np.sum(~thresh_preds & ~refs))  # Not used in metrics

                precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
                f1 = (
                    2 * (precision * recall) / (precision + recall)
                    if (precision + recall) > 0
                    else 0.0
                )

                threshold_metrics.append(
                    {
                        "threshold": float(threshold),
                        "precision": precision,
                        "recall": recall,
                        "f1": f1,
                    }
                )

            confidence_analysis["threshold_sensitivity"] = threshold_metrics

        return {
            "confusion_matrix": confusion_matrix,
            "rates": {
                "true_positive_rate": true_positive_rate,
                "false_positive_rate": false_positive_rate,
                "predicted_positive_rate": positive_rate,
            },
            "class_distribution": class_distribution,
            "confidence_analysis": confidence_analysis,
            "total_samples": total,
        }
```
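A minimal sketch of calling `TimingAnalyzer.analyze`; the boolean decisions and confidence scores are invented, and the optional `confidences` key in `metadata` is simply the key the code above checks for before running the threshold analysis.

```python
# Illustrative sketch only -- hypothetical decisions and confidences.
from sage.benchmark.benchmark_agent.evaluation.analyzers import TimingAnalyzer

analyzer = TimingAnalyzer()
result = analyzer.analyze(
    predictions=[True, False, True, False],
    references=[True, False, False, True],
    metadata={"confidences": [0.9, 0.2, 0.6, 0.4]},
)
print(result["confusion_matrix"])  # {"true_positives": 1, "false_positives": 1, ...}
print(result["confidence_analysis"]["threshold_sensitivity"][0])
```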
sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py
@@ -0,0 +1,124 @@
```python
"""Tool selection analyzer for detailed error analysis."""

from collections import Counter, defaultdict
from typing import Any, Optional, Sequence


class ToolSelectionAnalyzer:
    """
    Analyzer for tool selection predictions.

    Provides breakdowns by:
    - Category coverage
    - Error patterns (wrong tool categories)
    - Tool popularity in predictions vs references
    """

    name = "tool_selection"

    def __init__(self, tools_metadata: Optional[dict[str, Any]] = None):
        """
        Initialize analyzer.

        Args:
            tools_metadata: Optional metadata about tools (categories, etc.)
        """
        self.tools_metadata = tools_metadata or {}

    def analyze(
        self,
        predictions: Sequence[list[str]],
        references: Sequence[list[str]],
        metadata: dict[str, Any],
    ) -> dict[str, Any]:
        """
        Analyze tool selection predictions.

        Args:
            predictions: List of predicted tool ID lists
            references: List of reference tool ID lists
            metadata: Additional context

        Returns:
            Dictionary with analysis results
        """
        # Tool frequency analysis
        pred_tools: Counter[str] = Counter()
        ref_tools: Counter[str] = Counter()

        for pred_list in predictions:
            pred_tools.update(pred_list)
        for ref_list in references:
            ref_tools.update(ref_list)

        # Error pattern analysis
        errors_by_type: dict[str, int] = defaultdict(int)
        correct_selections = 0
        total_predictions = 0

        category_hits: dict[str, int] = defaultdict(int)
        category_misses: dict[str, int] = defaultdict(int)

        for pred, ref in zip(predictions, references):
            ref_set = set(ref)
            pred_set = set(pred)

            # Count correct and incorrect
            correct = pred_set & ref_set
            incorrect = pred_set - ref_set

            correct_selections += len(correct)
            total_predictions += len(pred)

            if len(correct) == 0:
                errors_by_type["complete_miss"] += 1
            elif len(incorrect) > 0:
                errors_by_type["partial_correct"] += 1
            else:
                errors_by_type["all_correct"] += 1

            # Category-level analysis if metadata available
            for tool_id in ref:
                category = self._get_category(tool_id)
                if tool_id in pred_set:
                    category_hits[category] += 1
                else:
                    category_misses[category] += 1

        # Coverage statistics
        pred_tool_set = set(pred_tools.keys())
        ref_tool_set = set(ref_tools.keys())

        return {
            "error_patterns": dict(errors_by_type),
            "tool_coverage": {
                "predicted_tools": len(pred_tool_set),
                "reference_tools": len(ref_tool_set),
                "overlap": len(pred_tool_set & ref_tool_set),
                "predicted_only": len(pred_tool_set - ref_tool_set),
                "missed": len(ref_tool_set - pred_tool_set),
            },
            "tool_frequency": {
                "top_predicted": pred_tools.most_common(10),
                "top_reference": ref_tools.most_common(10),
            },
            "category_performance": {
                "hits_by_category": dict(category_hits),
                "misses_by_category": dict(category_misses),
            },
            "selection_accuracy": {
                "correct_selections": correct_selections,
                "total_predictions": total_predictions,
                "accuracy": (
                    correct_selections / total_predictions if total_predictions > 0 else 0.0
                ),
            },
        }

    def _get_category(self, tool_id: str) -> str:
        """Extract category from tool ID."""
        # Tool ID format: {domain}_{category}_{number}
        parts = tool_id.split("_")
        if len(parts) >= 2:
            return f"{parts[0]}_{parts[1]}"
        return "unknown"
```
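Finally, a hedged usage sketch for `ToolSelectionAnalyzer`; the tool IDs follow the `{domain}_{category}_{number}` pattern that `_get_category` assumes but are invented for this example.

```python
# Illustrative sketch only -- hypothetical tool IDs.
from sage.benchmark.benchmark_agent.evaluation.analyzers import ToolSelectionAnalyzer

analyzer = ToolSelectionAnalyzer()
report = analyzer.analyze(
    predictions=[["web_search_01", "web_search_02"], ["db_query_01"]],
    references=[["web_search_01"], ["db_query_02"]],
    metadata={},
)
print(report["error_patterns"])        # {"partial_correct": 1, "complete_miss": 1}
print(report["category_performance"])  # hits/misses keyed by "web_search", "db_query"
```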