mcpbr 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_env.py +6 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +8 -0
- mcpbr/latency_metrics.py +317 -0
- mcpbr/sampling.py +193 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/METADATA +10 -6
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/RECORD +27 -16
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.14.data → mcpbr-0.4.16.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/licenses/LICENSE +0 -0
mcpbr/latency_metrics.py
ADDED
@@ -0,0 +1,317 @@
"""Latency and performance benchmarking metrics for evaluation runs.

This module complements the PerformanceProfiler in profiler.py by providing
aggregate latency statistics across multiple evaluation tasks. While the profiler
tracks per-task performance, this module computes cross-task percentile distributions
and throughput metrics suitable for benchmarking reports.

Key capabilities:
- Per-task event timestamp tracking (start, first tool call, first response, end)
- Per-tool-call latency recording within each task
- Aggregate percentile statistics (p50, p95, p99, mean) across tasks
- Tokens-per-second throughput calculation
- Human-readable latency report formatting
"""

import statistics
from dataclasses import dataclass, field
from typing import Any


def percentile(data: list[float], p: float) -> float:
    """Calculate the p-th percentile of a list of values.

    Uses linear interpolation between closest ranks for accurate percentile
    estimation, falling back to boundary values for edge cases.

    Args:
        data: List of numeric values. Must not be empty.
        p: Percentile to compute, in range [0, 100].

    Returns:
        The interpolated percentile value.

    Raises:
        ValueError: If data is empty or p is outside [0, 100].
    """
    if not data:
        raise ValueError("Cannot compute percentile of empty data")
    if p < 0 or p > 100:
        raise ValueError(f"Percentile must be between 0 and 100, got {p}")

    sorted_data = sorted(data)
    n = len(sorted_data)

    if n == 1:
        return sorted_data[0]

    # Compute the rank using the C = 1 interpolation method (same as Excel PERCENTILE.INC)
    rank = (p / 100) * (n - 1)
    lower_index = int(rank)
    upper_index = lower_index + 1
    fraction = rank - lower_index

    if upper_index >= n:
        return sorted_data[-1]

    return sorted_data[lower_index] + fraction * (
        sorted_data[upper_index] - sorted_data[lower_index]
    )


@dataclass
class LatencyTracker:
    """Records timestamps for key events during a single evaluation task.

    Tracks the lifecycle of a task from start to end, including when the first
    tool call and first response occur. Also records individual tool call latencies
    for fine-grained analysis.

    Attributes:
        task_id: Identifier for the task being tracked.
        task_start: Timestamp (seconds since epoch) when the task began.
        first_tool_call: Timestamp when the first tool call was initiated.
        first_response: Timestamp when the first response was received.
        task_end: Timestamp when the task completed.
        tool_call_latencies: List of individual tool call durations in seconds.
        total_tokens: Total tokens (input + output) consumed during the task.
    """

    task_id: str = ""
    task_start: float | None = None
    first_tool_call: float | None = None
    first_response: float | None = None
    task_end: float | None = None
    tool_call_latencies: list[float] = field(default_factory=list)
    total_tokens: int = 0

    def record_task_start(self, timestamp: float) -> None:
        """Record the task start timestamp.

        Args:
            timestamp: Time in seconds (e.g., from time.time()).
        """
        self.task_start = timestamp

    def record_first_tool_call(self, timestamp: float) -> None:
        """Record the first tool call timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_tool_call is None:
            self.first_tool_call = timestamp

    def record_first_response(self, timestamp: float) -> None:
        """Record the first response timestamp.

        Only records the first occurrence; subsequent calls are ignored.

        Args:
            timestamp: Time in seconds.
        """
        if self.first_response is None:
            self.first_response = timestamp

    def record_task_end(self, timestamp: float) -> None:
        """Record the task end timestamp.

        Args:
            timestamp: Time in seconds.
        """
        self.task_end = timestamp

    def record_tool_call_latency(self, duration_seconds: float) -> None:
        """Record the latency of an individual tool call.

        Args:
            duration_seconds: Duration of the tool call in seconds.
        """
        self.tool_call_latencies.append(duration_seconds)

    @property
    def time_to_first_tool_call(self) -> float | None:
        """Calculate time from task start to first tool call.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.first_tool_call is not None:
            return self.first_tool_call - self.task_start
        return None

    @property
    def total_task_duration(self) -> float | None:
        """Calculate total task duration from start to end.

        Returns:
            Duration in seconds, or None if either timestamp is missing.
        """
        if self.task_start is not None and self.task_end is not None:
            return self.task_end - self.task_start
        return None

    @property
    def tokens_per_second(self) -> float | None:
        """Calculate throughput in tokens per second.

        Returns:
            Tokens per second, or None if duration is zero or unavailable.
        """
        duration = self.total_task_duration
        if duration is not None and duration > 0 and self.total_tokens > 0:
            return self.total_tokens / duration
        return None


def _compute_distribution(values: list[float]) -> dict[str, float]:
    """Compute percentile distribution and mean for a list of values.

    Args:
        values: List of numeric values. Must not be empty.

    Returns:
        Dictionary with keys: p50, p95, p99, mean.
    """
    return {
        "p50": percentile(values, 50),
        "p95": percentile(values, 95),
        "p99": percentile(values, 99),
        "mean": statistics.mean(values),
    }


def compute_latency_stats(trackers: list["LatencyTracker"]) -> dict[str, Any]:
    """Compute aggregate latency statistics across multiple task trackers.

    Collects timing data from all trackers and produces percentile distributions
    for key metrics: time to first tool call, total task duration, individual
    tool call latency, and tokens-per-second throughput.

    Args:
        trackers: List of LatencyTracker instances with recorded data.

    Returns:
        Dictionary containing:
        - time_to_first_tool_call: {p50, p95, p99, mean} or None
        - total_task_duration: {p50, p95, p99, mean} or None
        - tool_call_latency: {p50, p95, p99, mean} or None
        - tokens_per_second: {p50, p95, p99, mean} or None
        - task_count: number of trackers analyzed
    """
    if not trackers:
        return {
            "time_to_first_tool_call": None,
            "total_task_duration": None,
            "tool_call_latency": None,
            "tokens_per_second": None,
            "task_count": 0,
        }

    # Collect values from all trackers
    ttftc_values: list[float] = []
    duration_values: list[float] = []
    tool_latency_values: list[float] = []
    tps_values: list[float] = []

    for tracker in trackers:
        ttftc = tracker.time_to_first_tool_call
        if ttftc is not None:
            ttftc_values.append(ttftc)

        duration = tracker.total_task_duration
        if duration is not None:
            duration_values.append(duration)

        tool_latency_values.extend(tracker.tool_call_latencies)

        tps = tracker.tokens_per_second
        if tps is not None:
            tps_values.append(tps)

    return {
        "time_to_first_tool_call": _compute_distribution(ttftc_values) if ttftc_values else None,
        "total_task_duration": _compute_distribution(duration_values) if duration_values else None,
        "tool_call_latency": (
            _compute_distribution(tool_latency_values) if tool_latency_values else None
        ),
        "tokens_per_second": _compute_distribution(tps_values) if tps_values else None,
        "task_count": len(trackers),
    }


def _format_distribution(label: str, dist: dict[str, float], unit: str = "s") -> str:
    """Format a single distribution as a human-readable line.

    Args:
        label: Name of the metric.
        dist: Distribution dict with p50, p95, p99, mean.
        unit: Unit suffix to append to values.

    Returns:
        Formatted string line.
    """
    return (
        f"  {label}:\n"
        f"    Mean: {dist['mean']:.3f}{unit}\n"
        f"    p50: {dist['p50']:.3f}{unit}\n"
        f"    p95: {dist['p95']:.3f}{unit}\n"
        f"    p99: {dist['p99']:.3f}{unit}"
    )


def format_latency_report(stats: dict[str, Any]) -> str:
    """Format latency statistics into a human-readable report.

    Produces a multi-line text report suitable for console output or inclusion
    in benchmark result files.

    Args:
        stats: Statistics dictionary as returned by compute_latency_stats().

    Returns:
        Formatted multi-line report string.
    """
    lines: list[str] = []
    lines.append("=" * 50)
    lines.append("Latency & Performance Report")
    lines.append("=" * 50)
    lines.append(f"Tasks analyzed: {stats.get('task_count', 0)}")
    lines.append("")

    ttftc = stats.get("time_to_first_tool_call")
    if ttftc is not None:
        lines.append(_format_distribution("Time to First Tool Call", ttftc))
        lines.append("")

    duration = stats.get("total_task_duration")
    if duration is not None:
        lines.append(_format_distribution("Total Task Duration", duration))
        lines.append("")

    tool_latency = stats.get("tool_call_latency")
    if tool_latency is not None:
        lines.append(_format_distribution("Tool Call Latency", tool_latency))
        lines.append("")

    tps = stats.get("tokens_per_second")
    if tps is not None:
        lines.append(_format_distribution("Throughput", tps, unit=" tok/s"))
        lines.append("")

    if all(
        stats.get(key) is None
        for key in [
            "time_to_first_tool_call",
            "total_task_duration",
            "tool_call_latency",
            "tokens_per_second",
        ]
    ):
        lines.append("  No latency data available.")
        lines.append("")

    lines.append("=" * 50)
    return "\n".join(lines)
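For orientation, here is a minimal usage sketch of the new module. It is illustrative only, not part of the diff: it assumes the 0.4.16 wheel is installed, and the timestamps, token counts, and task ids are synthetic.

# Usage sketch (illustrative, not part of the package).
import time

from mcpbr.latency_metrics import (
    LatencyTracker,
    compute_latency_stats,
    format_latency_report,
    percentile,
)

# PERCENTILE.INC-style interpolation: rank = (95 / 100) * (4 - 1) = 2.85,
# so the result is 3 + 0.85 * (4 - 3) = 3.85.
print(percentile([1.0, 2.0, 3.0, 4.0], 95))  # ~3.85

trackers = []
for i, (duration, tokens) in enumerate([(2.0, 400), (3.0, 450), (5.0, 900)]):
    t = LatencyTracker(task_id=f"task-{i}")  # synthetic task ids
    start = time.time()
    t.record_task_start(start)
    t.record_first_tool_call(start + 0.2)  # first tool call 200 ms in
    t.record_tool_call_latency(0.5)        # one 500 ms tool call
    t.record_task_end(start + duration)
    t.total_tokens = tokens
    trackers.append(t)

stats = compute_latency_stats(trackers)
print(stats["total_task_duration"]["p50"])  # 3.0 (median of 2.0, 3.0, 5.0)
print(format_latency_report(stats))

Note that a single-task run degrades gracefully: with n == 1, percentile() returns the lone value for every percentile.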
mcpbr/sampling.py
ADDED
@@ -0,0 +1,193 @@
"""Sampling strategies for benchmark task selection.

Provides random and stratified sampling with seed control for reproducible
benchmark evaluations. Supports sequential (default), random, and stratified
sampling strategies.
"""

import random
from collections import defaultdict
from enum import Enum
from typing import Any


class SamplingStrategy(Enum):
    """Sampling strategy for selecting benchmark tasks.

    Attributes:
        SEQUENTIAL: Take the first N tasks (default behavior, backward compatible).
        RANDOM: Randomly sample N tasks with optional seed for reproducibility.
        STRATIFIED: Group tasks by a field, then sample proportionally from each group.
    """

    SEQUENTIAL = "sequential"
    RANDOM = "random"
    STRATIFIED = "stratified"


def sample_tasks(
    tasks: list[dict[str, Any]],
    sample_size: int | None = None,
    strategy: SamplingStrategy = SamplingStrategy.SEQUENTIAL,
    seed: int | None = None,
    stratify_field: str | None = None,
) -> list[dict[str, Any]]:
    """Sample tasks from a list using the specified strategy.

    Args:
        tasks: Full list of task dictionaries to sample from.
        sample_size: Number of tasks to select. None returns all tasks.
        strategy: Sampling strategy to use.
        seed: Random seed for reproducibility (used by RANDOM and STRATIFIED).
        stratify_field: Field name to group by for STRATIFIED sampling.
            Required when strategy is STRATIFIED.

    Returns:
        List of sampled task dictionaries.

    Raises:
        ValueError: If strategy is STRATIFIED but stratify_field is not provided.
        ValueError: If strategy is STRATIFIED but stratify_field is not found in any task.
    """
    if not tasks:
        return []

    if sample_size is None or sample_size >= len(tasks):
        return list(tasks)

    if sample_size <= 0:
        return []

    if strategy == SamplingStrategy.SEQUENTIAL:
        return _sample_sequential(tasks, sample_size)
    elif strategy == SamplingStrategy.RANDOM:
        return _sample_random(tasks, sample_size, seed)
    elif strategy == SamplingStrategy.STRATIFIED:
        return _sample_stratified(tasks, sample_size, seed, stratify_field)
    else:
        raise ValueError(f"Unknown sampling strategy: {strategy}")


def _sample_sequential(
    tasks: list[dict[str, Any]],
    sample_size: int,
) -> list[dict[str, Any]]:
    """Take the first N tasks sequentially.

    This matches the existing behavior where tasks[:sample_size] is used.

    Args:
        tasks: Full list of tasks.
        sample_size: Number of tasks to select.

    Returns:
        First sample_size tasks from the list.
    """
    return tasks[:sample_size]


def _sample_random(
    tasks: list[dict[str, Any]],
    sample_size: int,
    seed: int | None = None,
) -> list[dict[str, Any]]:
    """Randomly sample N tasks with optional seed for reproducibility.

    Args:
        tasks: Full list of tasks.
        sample_size: Number of tasks to select.
        seed: Random seed for reproducibility.

    Returns:
        Randomly selected tasks.
    """
    rng = random.Random(seed)
    return rng.sample(tasks, sample_size)


def _sample_stratified(
    tasks: list[dict[str, Any]],
    sample_size: int,
    seed: int | None = None,
    stratify_field: str | None = None,
) -> list[dict[str, Any]]:
    """Sample proportionally from groups defined by stratify_field.

    Groups tasks by the value of stratify_field, then samples from each group
    proportionally to its size in the original dataset. Uses round-robin allocation
    for any remainder to ensure exact sample_size is met.

    Args:
        tasks: Full list of tasks.
        sample_size: Total number of tasks to select across all groups.
        seed: Random seed for reproducibility.
        stratify_field: Field name to group tasks by.

    Returns:
        Stratified sample of tasks.

    Raises:
        ValueError: If stratify_field is None or not found in any task.
    """
    if not stratify_field:
        raise ValueError("stratify_field is required when using STRATIFIED sampling strategy")

    # Group tasks by the stratify_field value
    groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for task in tasks:
        key = str(task.get(stratify_field, "_unknown_"))
        groups[key] = groups.get(key, [])
        groups[key].append(task)

    # Check that at least one task had the stratify_field
    if len(groups) == 1 and "_unknown_" in groups:
        raise ValueError(
            f"stratify_field '{stratify_field}' not found in any task. "
            f"Available fields: {list(tasks[0].keys()) if tasks else []}"
        )

    total_tasks = len(tasks)
    rng = random.Random(seed)

    # Sort group keys for deterministic ordering
    sorted_keys = sorted(groups.keys())

    # Calculate proportional allocation for each group
    allocations: dict[str, int] = {}
    allocated = 0
    for key in sorted_keys:
        group_size = len(groups[key])
        # Proportional allocation (floor)
        proportion = group_size / total_tasks
        count = int(sample_size * proportion)
        allocations[key] = count
        allocated += count

    # Distribute remainder using round-robin over groups sorted by fractional part
    remainder = sample_size - allocated
    if remainder > 0:
        # Sort groups by their fractional allocation (descending) for fair distribution
        fractional_parts = []
        for key in sorted_keys:
            group_size = len(groups[key])
            proportion = group_size / total_tasks
            exact = sample_size * proportion
            fractional = exact - int(exact)
            fractional_parts.append((fractional, key))

        fractional_parts.sort(key=lambda x: x[0], reverse=True)

        for i in range(remainder):
            _, key = fractional_parts[i % len(fractional_parts)]
            allocations[key] += 1

    # Sample from each group
    result: list[dict[str, Any]] = []
    for key in sorted_keys:
        group = groups[key]
        count = min(allocations[key], len(group))
        if count > 0:
            sampled = rng.sample(group, count)
            result.extend(sampled)

    return result
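Likewise, a short sketch of the sampling API. Illustrative only, not part of the diff: the "category" field and task dictionaries are invented for the example.

# Usage sketch (illustrative, not part of the package).
from mcpbr.sampling import SamplingStrategy, sample_tasks

tasks = [{"id": f"easy-{i}", "category": "easy"} for i in range(6)] + [
    {"id": f"hard-{i}", "category": "hard"} for i in range(3)
]

# Proportional allocation for sample_size=3: "easy" gets int(3 * 6/9) = 2,
# "hard" gets int(3 * 3/9) = 1, and there is no remainder to distribute.
sample = sample_tasks(
    tasks,
    sample_size=3,
    strategy=SamplingStrategy.STRATIFIED,
    seed=42,
    stratify_field="category",
)
assert len(sample) == 3
assert sum(t["category"] == "easy" for t in sample) == 2

# With sample_size=4 the floors allocate only 2 + 1 = 3, and the leftover
# slot goes to "easy", the group with the larger fractional part (0.67 vs 0.33).
sample4 = sample_tasks(
    tasks,
    sample_size=4,
    strategy=SamplingStrategy.STRATIFIED,
    seed=0,
    stratify_field="category",
)
assert sum(t["category"] == "easy" for t in sample4) == 3

# The default strategy stays backward compatible: a plain first-N slice.
assert sample_tasks(tasks, 2) == tasks[:2]

Passing the same seed with the same input yields the same sample, which is what makes runs reproducible across machines.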
{mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcpbr
-Version: 0.4.14
+Version: 0.4.16
 Summary: Model Context Protocol Benchmark Runner - evaluate MCP servers against software engineering benchmarks
 Project-URL: Homepage, https://github.com/greynewell/mcpbr
 Project-URL: Repository, https://github.com/greynewell/mcpbr
@@ -100,7 +100,7 @@ mcpbr runs controlled experiments: same model, same tasks, same environment - th
 
 ## Supported Benchmarks
 
-mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction layer:
+mcpbr supports 30+ benchmarks across 10 categories through a flexible abstraction layer:
 
 | Category | Benchmarks |
 |----------|-----------|
@@ -111,7 +111,11 @@ mcpbr supports 25+ benchmarks across 8 categories through a flexible abstraction
 | **Tool Use & Agents** | [MCPToolBench++](https://greynewell.github.io/mcpbr/benchmarks/mcptoolbench/), [ToolBench](https://greynewell.github.io/mcpbr/benchmarks/toolbench/), [AgentBench](https://greynewell.github.io/mcpbr/benchmarks/agentbench/), [WebArena](https://greynewell.github.io/mcpbr/benchmarks/webarena/), [TerminalBench](https://greynewell.github.io/mcpbr/benchmarks/terminalbench/), [InterCode](https://greynewell.github.io/mcpbr/benchmarks/intercode/) |
 | **ML Research** | [MLAgentBench](https://greynewell.github.io/mcpbr/benchmarks/mlagentbench/) |
 | **Code Understanding** | [RepoQA](https://greynewell.github.io/mcpbr/benchmarks/repoqa/) |
+| **Multimodal** | MMMU |
+| **Long Context** | LongBench |
+| **Safety & Adversarial** | Adversarial (HarmBench) |
 | **Security** | [CyberGym](https://greynewell.github.io/mcpbr/benchmarks/cybergym/) |
+| **Custom** | User-defined benchmarks via YAML |
 
 ### Featured Benchmarks
 
@@ -1470,10 +1474,10 @@ We're building the defacto standard for MCP server benchmarking! Our [v1.0 Roadm
 - Cost analysis in reports
 
 **Phase 2: Benchmarks** (v0.4.0)
-- 
-- 
-- Custom
-- 
+- ✅ 30+ benchmarks across 10 categories
+- ✅ Custom benchmark YAML support
+- ✅ Custom metrics, failure analysis, sampling strategies
+- ✅ Dataset versioning, latency metrics, GPU support, few-shot learning
 
 **Phase 3: Developer Experience** (v0.5.0)
 - Real-time dashboard
{mcpbr-0.4.14.dist-info → mcpbr-0.4.16.dist-info}/RECORD
@@ -3,16 +3,22 @@ mcpbr/__main__.py,sha256=WmeQsAqtW_9tMTNKArH1m76DPBokZpXuy6dMZp13gXA,132
 mcpbr/agent.py,sha256=aSFH2S3ExKZfdVfMbzk6D1nRhpKt4JmpRzmF4Vi6Gmo,5795
 mcpbr/cache.py,sha256=YiP13omwMbXLb6NhNocJvL58enXEx9J8OrvTZnWUkw4,13254
 mcpbr/cli.py,sha256=xvh7gpJx0LzjV3g-Te4FF7BfHubGzDxOiYQsSeQnCEc,68276
-mcpbr/config.py,sha256=
+mcpbr/config.py,sha256=7lWV0ZtzyD6WZ07IR4yhT9lyBBPONzlanaO4XHm9OoE,18952
 mcpbr/config_inheritance.py,sha256=0EV9Tv62UFNgZoc8mY7yYjHEbnMM_R5EAhSeuK7ajAA,6617
 mcpbr/config_validator.py,sha256=ZMEIeK4y6fSwyY46Xv5dK5v3jM4HDKcYkosnIcn7iyI,20488
-mcpbr/
+mcpbr/custom_metrics.py,sha256=4pMO9-BPpeQ_GUTnZ18TQXINFScAMH3cIYm0HG-C51o,13213
+mcpbr/dataset_versioning.py,sha256=Y_ZSGhl8ihl6Kgee_p7VbkNwGhgwIdMZPlRunvk4knY,7149
+mcpbr/docker_env.py,sha256=_45OUZKjUevE9O3YLF_1uvQtdOyJ7yZIYWmSvXN3cFw,31794
 mcpbr/env_expansion.py,sha256=Rkhth-tWV8CptQlSSk9exuMsUaSTTW9hj69z4snZd_U,6122
 mcpbr/evaluation.py,sha256=EjPREWv7hBRqhBhNan0ERh2imqMBegT0Y2cgZlTxRGk,12765
-mcpbr/
+mcpbr/failure_analysis.py,sha256=N5xp9YPe2d7P9fTa2LVSHsPgB1WOQtWMeClq3bOv4_c,19883
+mcpbr/few_shot.py,sha256=bFDdes_kgZAFWoFZQEfZG5Z2Es9rmkB1jsxSMp4aCCM,11684
+mcpbr/gpu_support.py,sha256=eroBiLkt1A3Q2ODJDSyqrd_BzcMh8tFkjtPn7PsvJJc,5070
+mcpbr/harness.py,sha256=8-qmcPR2CDFuoBib9g6lPx7aMOK-5PuZgpWhpGs-Ils,51419
 mcpbr/harnesses.py,sha256=h9iDp4qkPABNwO9OXbJ61qcD4n0oAUTU7AQksxRKLcg,47335
 mcpbr/incremental_save.py,sha256=1dm3pGiEIhP8cVk_Y6XF_cAdo3B_vyRc6CO8Wt-MyIA,4830
 mcpbr/junit_reporter.py,sha256=M_02zJbFbA3VoIYG5oR7VDecqWHEpIee-JOUShWNuLU,9261
+mcpbr/latency_metrics.py,sha256=xNMaUzGMSbOIfuoyZGyIfyMk5uAmoj6K65ZAs5D6Z8c,10476
 mcpbr/log_formatter.py,sha256=d2jWH7z4IRSbr8-PbnEt3TmLAqk8vgdPT38uTnTCN5c,21488
 mcpbr/models.py,sha256=zsrBrwFeOfNKgThUbT1oPkF5pdRjL1QJjMte0vXjcbk,3710
 mcpbr/output_validator.py,sha256=TUoBtDjjXvR6MACbWV6uNOsxM_n4C0Jbn5in35HH4K8,1750
@@ -22,6 +28,7 @@ mcpbr/profiler.py,sha256=SRXLKf2TOlpnMbQpGvjRy1Agv-XaEz6lDmBa5WGNv8c,15954
 mcpbr/providers.py,sha256=ebrnH6RXODxX4Ma9r7Is5VBHYFNP5LwCs-vpLbbHP8o,6598
 mcpbr/regression.py,sha256=xm_ago8ZP3RAOrDNjtINwyRUvzKWJcJDWbzf3hp6LlU,12827
 mcpbr/reporting.py,sha256=Odzb7EgpimW-qh01VQedhb2X594ACrOcGe4jshgiwTg,56111
+mcpbr/sampling.py,sha256=Hpgh2TayI3QGcno-Np9eYi8sklxKEZQXyhpaQlc9T4Q,6248
 mcpbr/schema.py,sha256=fdjiKmp1au2oN5aXcPRoCbyvwm2XeMD5DmeWSurMk4A,6858
 mcpbr/smoke_test.py,sha256=srYGOn_auspRbt_a6ebYDDDq_nujA_iZGman5nU1ikU,14925
 mcpbr/state_tracker.py,sha256=rIP9LIHtQg6oBsLIxnwRjE865Kw6U7DMO_GzzuMRC0E,10790
@@ -29,7 +36,8 @@ mcpbr/statistics.py,sha256=Ny8TMdBrIpS4KfKCJcuFfTeaGuTmEkS1G_uHBlboYdA,19134
 mcpbr/streaming.py,sha256=XPhkXO1R1EsWtkoPvCpyy4TehEom7hkuOeP-00joX3o,13853
 mcpbr/swebench_test_specs.py,sha256=Mh_BPjcexkgDT3p4zT2p31925b8w5tgsxxRpYZQZalM,1390
 mcpbr/templates.py,sha256=dqwboVB-yfE06w2rgDOvuWJB4Hx5duH_W-jvLBqmlKg,10683
-mcpbr/benchmarks/__init__.py,sha256=
+mcpbr/benchmarks/__init__.py,sha256=2-7Ebg6-wHo1QGfVKWjjbREcLG_A-6Q0XfZGiyXrOeE,4489
+mcpbr/benchmarks/adversarial.py,sha256=69VBTZv6BhR1JwjQepA_YwAu3b--vJviGd6IWs2h1QA,12357
 mcpbr/benchmarks/agentbench.py,sha256=jQ8OG_5cn-PvOZizXivysLTw9xvtA8c_MWfw3jXq0TQ,6512
 mcpbr/benchmarks/aider_polyglot.py,sha256=_uWYNVaW0YWEWuuSXNxsqSngvWjo0HUeubcj16Q25uk,7256
 mcpbr/benchmarks/apps.py,sha256=mvN26KNICxGZh0sxCmxR0Ph6hfXnqRsVO-oB5I6MjgQ,7801
@@ -39,6 +47,7 @@ mcpbr/benchmarks/bigbench_hard.py,sha256=jwG5YV97xo6FiNnpAUseJVO_a_6QkpCYZ1r1mGi
 mcpbr/benchmarks/bigcodebench.py,sha256=dK4QkRTM6D1v3pprBgAxSTsOz7mJqi9f4sOfMKJUJXM,7117
 mcpbr/benchmarks/codecontests.py,sha256=Kx_izYR9D1sMcfVtslCN0upGsPtbXir7UHjL1fEZzc0,8905
 mcpbr/benchmarks/codereval.py,sha256=n77q2mXgMNg7wdeoMOSNKbLh86IrwG8iIzd64Gb0NEc,8341
+mcpbr/benchmarks/custom.py,sha256=cjuhZLSyS4oCZun-3JJo3fsSVs-lcRv5kzaoQ_m2MTU,20675
 mcpbr/benchmarks/cybergym.py,sha256=r5itZNGdiDtztlC_BGLCdtLBZu0jgAyyG2_8cNUCoJ8,18574
 mcpbr/benchmarks/gaia.py,sha256=4Lxe6YAbKyIiPYgszvRcoia74TLZ6FqoIY5_337Vjtw,6852
 mcpbr/benchmarks/gsm8k.py,sha256=CK9C6qQi3rO81nuGcE-od2-PvQ48lmL-nQcLIeZDrbM,12730
@@ -46,10 +55,12 @@ mcpbr/benchmarks/hellaswag.py,sha256=Ah8Pub7QI94lgGHnbC6g3US4NTkt-zWSReS4h9Y6XGU
 mcpbr/benchmarks/humaneval.py,sha256=J9hCB17ppey81p4HS2ynGFsDDGLOdJhw63OSaG7vhT8,18296
 mcpbr/benchmarks/intercode.py,sha256=iq0X75aL469xIR8mVGUNaPlgdqAlySPsa2YWoSftw5M,8737
 mcpbr/benchmarks/leetcode.py,sha256=lan8A5D5Bfe5B6t_wx4KzZsAr9iNF7vch0Em2g9bX-k,7772
+mcpbr/benchmarks/longbench.py,sha256=Hb4lGiojG3apRajgsI7c0DkcP1WzqdMrdpPEkI-WAkE,20791
 mcpbr/benchmarks/math_benchmark.py,sha256=LP_gjp3Cgzt1kDWVPqufRHg0YE0N9ouThOI6avpYxCk,8322
 mcpbr/benchmarks/mbpp.py,sha256=e1tgQJOEeAQAlkeYMBr4jymTYvC9s_Nt34TKExFVFy4,6907
 mcpbr/benchmarks/mcptoolbench.py,sha256=ioXPdXeXQEgBCHccOq7ier_-ucfQI41hUu0Z4HSIIAg,16209
 mcpbr/benchmarks/mlagentbench.py,sha256=Qr_BRhQFgK66KcEAr0svP44a-twWkXeTQVPQHdX7HpM,8367
+mcpbr/benchmarks/mmmu.py,sha256=jvIgpM-ofJAkmuDKA0jMktDBsX41s0zyC8PRG5qSBlw,11929
 mcpbr/benchmarks/repoqa.py,sha256=0Z9WxXl2dFgSWLNRGFNGd2kOU_rItNrtSdF8ZbC2TqI,6509
 mcpbr/benchmarks/swebench.py,sha256=Eo4dL1BLabQqZvSLR9xqoDmEdy0Y0mLTgincbV78DjQ,6473
 mcpbr/benchmarks/terminalbench.py,sha256=I9YLeZh5j_AYvUJFhZkhlDTfIWU3OvcuJLjzYlfAZuw,7166
@@ -69,15 +80,15 @@ mcpbr/infrastructure/azure_health.py,sha256=xITmIa9IfYIwxcVhY0sJ81a-6WNKiT8kSQTd
 mcpbr/infrastructure/base.py,sha256=Olj6uiNBeGoUqltZI1NHZfa26kzT-6jfp8YIXSykFKM,3037
 mcpbr/infrastructure/local.py,sha256=VK6UAg7Dzvb9v1LAJgNGA_s0blQKrHAQEXBAC75zAL8,4237
 mcpbr/infrastructure/manager.py,sha256=j0T7U1Tbajmfve4SNfhYKikvL9kgSVT01fYKMC-sH-s,4796
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
-mcpbr-0.4.
+mcpbr-0.4.16.data/data/mcpbr/data/templates/brave-search.yaml,sha256=PYHXJOaDqYKoqdJc3JV1WbaL-BacrdkQPck1eKGbMPo,1098
+mcpbr-0.4.16.data/data/mcpbr/data/templates/filesystem.yaml,sha256=1p6Z6ChViFYHAODYD71JFst6gdhR5y5rnWNf7Pp5zOY,1091
+mcpbr-0.4.16.data/data/mcpbr/data/templates/github.yaml,sha256=uzPwq5_loFegvH6RNov1MQclbBiFBgYWzpiKLfEN9H4,1133
+mcpbr-0.4.16.data/data/mcpbr/data/templates/google-maps.yaml,sha256=ldR7E9UmuAA-3nJZ1SShD7PhG0_AwDJOSYuy19hQ6cI,1116
+mcpbr-0.4.16.data/data/mcpbr/data/templates/postgres.yaml,sha256=r6R1069BhV4ADQGPZ-T9r6xMNwbr2yrNh8-IHPb4XiI,1178
+mcpbr-0.4.16.data/data/mcpbr/data/templates/slack.yaml,sha256=dBn_YqlFJMJai_55sRDb4hXClgxRpcyYTlWl4LBkpuo,1072
+mcpbr-0.4.16.data/data/mcpbr/data/templates/sqlite.yaml,sha256=UR5yN9f8v_BC6oskny2xMldHWzZrB9b_PpFSmv5eccg,1080
+mcpbr-0.4.16.dist-info/METADATA,sha256=GeSnMZw0x7-XPhblIu50aCO7NXaNfjgVScnBOp6ZaOA,55069
+mcpbr-0.4.16.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+mcpbr-0.4.16.dist-info/entry_points.txt,sha256=lLL8icujqBF36V9bF4gfaB2at4cFKCiv2IdJ1i5hT9U,41
+mcpbr-0.4.16.dist-info/licenses/LICENSE,sha256=mcXLPreEXzD-816yLKmocCPr9_k3gFFo62TjrSuKkIQ,1075
+mcpbr-0.4.16.dist-info/RECORD,,