mcpbr 0.4.14__py3-none-any.whl → 0.4.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,405 @@
+"""Custom metrics framework for flexible evaluation beyond standard accuracy/pass rates.
+
+This module provides:
+- MetricDefinition dataclass for declaring metrics with name, description, compute
+  function, aggregation strategy, and direction (higher_is_better).
+- MetricRegistry for registering, looking up, and managing metrics.
+- Built-in metrics: accuracy, pass_rate, avg_tokens, avg_cost, avg_time,
+  tool_call_rate, failure_rate.
+- Support for composite metrics (e.g., cost_efficiency = pass_rate / avg_cost).
+- compute_metrics() to evaluate a set of metrics against result data.
+- validate_metric() to check metric definition validity.
+"""
+
+from __future__ import annotations
+
+import math
+import statistics
+from dataclasses import dataclass
+from typing import Any, Callable
+
+
+@dataclass
+class MetricDefinition:
+    """Definition of a single evaluation metric.
+
+    Attributes:
+        name: Unique identifier for the metric.
+        description: Human-readable description of what the metric measures.
+        compute_fn: Either a callable ``(list[dict]) -> float`` that computes the
+            metric from a list of result dicts, or a string expression referencing
+            other metric names (for composite metrics).
+        aggregation: Aggregation strategy used when summarising per-task values.
+            One of ``"mean"``, ``"sum"``, ``"min"``, ``"max"``, ``"median"``.
+        higher_is_better: Whether a higher value is considered better.
+    """
+
+    name: str
+    description: str
+    compute_fn: Callable[[list[dict[str, Any]]], float] | str
+    aggregation: str = "mean"
+    higher_is_better: bool = True
+
+
+_VALID_AGGREGATIONS = frozenset({"mean", "sum", "min", "max", "median"})
+
+
+class MetricRegistry:
+    """Registry for looking up and managing metric definitions.
+
+    Provides ``register``, ``get``, ``list_metrics``, and ``unregister`` operations.
+    """
+
+    def __init__(self) -> None:
+        self._metrics: dict[str, MetricDefinition] = {}
+
+    # -- public API ----------------------------------------------------------
+
+    def register(self, metric: MetricDefinition) -> None:
+        """Register a metric definition.
+
+        Args:
+            metric: The metric to register.
+
+        Raises:
+            ValueError: If a metric with the same name is already registered.
+        """
+        if metric.name in self._metrics:
+            raise ValueError(f"Metric '{metric.name}' is already registered")
+        self._metrics[metric.name] = metric
+
+    def get(self, name: str) -> MetricDefinition | None:
+        """Look up a metric by name.
+
+        Args:
+            name: Metric name.
+
+        Returns:
+            The MetricDefinition if found, otherwise ``None``.
+        """
+        return self._metrics.get(name)
+
+    def list_metrics(self) -> list[str]:
+        """Return a sorted list of all registered metric names."""
+        return sorted(self._metrics.keys())
+
+    def unregister(self, name: str) -> bool:
+        """Remove a metric from the registry.
+
+        Args:
+            name: Metric name to remove.
+
+        Returns:
+            ``True`` if the metric was removed, ``False`` if it was not found.
+        """
+        if name in self._metrics:
+            del self._metrics[name]
+            return True
+        return False
+
+    def __contains__(self, name: str) -> bool:
+        return name in self._metrics
+
+    def __len__(self) -> int:
+        return len(self._metrics)
+
+
+# ---------------------------------------------------------------------------
+# Built-in metric compute functions
+# ---------------------------------------------------------------------------
+
+
+def _compute_accuracy(results: list[dict[str, Any]]) -> float:
+    """Fraction of results where ``resolved`` is truthy."""
+    if not results:
+        return 0.0
+    resolved = sum(1 for r in results if r.get("resolved"))
+    return resolved / len(results)
+
+
+def _compute_pass_rate(results: list[dict[str, Any]]) -> float:
+    """Fraction of results where ``resolved`` is truthy (alias of accuracy)."""
+    return _compute_accuracy(results)
+
+
+def _compute_avg_tokens(results: list[dict[str, Any]]) -> float:
+    """Average total token count per result."""
+    token_counts: list[int] = []
+    for r in results:
+        tokens = r.get("tokens", {})
+        total = tokens.get("input", 0) + tokens.get("output", 0)
+        token_counts.append(total)
+    if not token_counts:
+        return 0.0
+    return float(statistics.mean(token_counts))
+
+
+def _compute_avg_cost(results: list[dict[str, Any]]) -> float:
+    """Average cost per result."""
+    costs = [r.get("cost", 0.0) for r in results]
+    if not costs:
+        return 0.0
+    return statistics.mean(costs)
+
+
+def _compute_avg_time(results: list[dict[str, Any]]) -> float:
+    """Average runtime in seconds per result."""
+    runtimes = [r.get("runtime_seconds", 0.0) for r in results]
+    if not runtimes:
+        return 0.0
+    return statistics.mean(runtimes)
+
+
+def _compute_tool_call_rate(results: list[dict[str, Any]]) -> float:
+    """Fraction of results that contain at least one tool call."""
+    if not results:
+        return 0.0
+    with_tools = sum(1 for r in results if r.get("tool_usage"))
+    return with_tools / len(results)
+
+
+def _compute_failure_rate(results: list[dict[str, Any]]) -> float:
+    """Fraction of results where ``error`` is present and non-empty."""
+    if not results:
+        return 0.0
+    with_errors = sum(1 for r in results if r.get("error"))
+    return with_errors / len(results)
+
+
+# ---------------------------------------------------------------------------
+# Built-in metric definitions
+# ---------------------------------------------------------------------------
+
+BUILTIN_METRICS: list[MetricDefinition] = [
+    MetricDefinition(
+        name="accuracy",
+        description="Fraction of tasks resolved successfully",
+        compute_fn=_compute_accuracy,
+        aggregation="mean",
+        higher_is_better=True,
+    ),
+    MetricDefinition(
+        name="pass_rate",
+        description="Fraction of tasks that pass (alias for accuracy)",
+        compute_fn=_compute_pass_rate,
+        aggregation="mean",
+        higher_is_better=True,
+    ),
+    MetricDefinition(
+        name="avg_tokens",
+        description="Average total tokens (input + output) per task",
+        compute_fn=_compute_avg_tokens,
+        aggregation="mean",
+        higher_is_better=False,
+    ),
+    MetricDefinition(
+        name="avg_cost",
+        description="Average API cost per task in USD",
+        compute_fn=_compute_avg_cost,
+        aggregation="mean",
+        higher_is_better=False,
+    ),
+    MetricDefinition(
+        name="avg_time",
+        description="Average runtime per task in seconds",
+        compute_fn=_compute_avg_time,
+        aggregation="mean",
+        higher_is_better=False,
+    ),
+    MetricDefinition(
+        name="tool_call_rate",
+        description="Fraction of tasks that used at least one tool",
+        compute_fn=_compute_tool_call_rate,
+        aggregation="mean",
+        higher_is_better=True,
+    ),
+    MetricDefinition(
+        name="failure_rate",
+        description="Fraction of tasks that encountered an error",
+        compute_fn=_compute_failure_rate,
+        aggregation="mean",
+        higher_is_better=False,
+    ),
+]
+
+
+def create_default_registry() -> MetricRegistry:
+    """Create a MetricRegistry pre-populated with all built-in metrics.
+
+    Returns:
+        A MetricRegistry instance containing the built-in metrics.
+    """
+    registry = MetricRegistry()
+    for metric in BUILTIN_METRICS:
+        registry.register(metric)
+    return registry
+
+
+# ---------------------------------------------------------------------------
+# Aggregation helpers
+# ---------------------------------------------------------------------------
+
+
+def _aggregate(values: list[float], method: str) -> float:
+    """Aggregate a list of floats using the specified method.
+
+    Args:
+        values: Numeric values to aggregate.
+        method: One of ``"mean"``, ``"sum"``, ``"min"``, ``"max"``, ``"median"``.
+
+    Returns:
+        Aggregated value.
+
+    Raises:
+        ValueError: If the method is unrecognised.
+    """
+    if not values:
+        return 0.0
+    if method == "mean":
+        return statistics.mean(values)
+    elif method == "sum":
+        return math.fsum(values)
+    elif method == "min":
+        return min(values)
+    elif method == "max":
+        return max(values)
+    elif method == "median":
+        return statistics.median(values)
+    else:
+        raise ValueError(f"Unknown aggregation method: {method!r}")
+
+
+# ---------------------------------------------------------------------------
+# Core public API
+# ---------------------------------------------------------------------------
+
+
+def compute_metrics(
+    results: list[dict[str, Any]],
+    metrics: list[str],
+    registry: MetricRegistry | None = None,
+) -> dict[str, float]:
+    """Compute the requested metrics over a list of result dicts.
+
+    Each result dict is expected to follow the structure used elsewhere in mcpbr
+    (keys such as ``resolved``, ``tokens``, ``cost``, ``runtime_seconds``,
+    ``tool_usage``, ``error``).
+
+    Composite metrics (whose ``compute_fn`` is a string expression) are resolved
+    by first computing all non-composite metrics they reference, then evaluating the
+    expression in a restricted namespace.
+
+    Args:
+        results: List of per-task result dictionaries.
+        metrics: List of metric names to compute.
+        registry: Optional MetricRegistry. If ``None``, the default registry
+            (containing built-in metrics) is used.
+
+    Returns:
+        Dictionary mapping metric names to their computed float values.
+
+    Raises:
+        KeyError: If a requested metric is not found in the registry.
+        ValueError: If a composite expression references an unknown metric or
+            fails to evaluate.
+    """
+    if registry is None:
+        registry = create_default_registry()
+
+    computed: dict[str, float] = {}
+
+    # Separate callable and composite (expression-based) metrics
+    callable_names: list[str] = []
+    composite_names: list[str] = []
+
+    for name in metrics:
+        metric_def = registry.get(name)
+        if metric_def is None:
+            raise KeyError(f"Metric '{name}' is not registered")
+        if callable(metric_def.compute_fn):
+            callable_names.append(name)
+        else:
+            composite_names.append(name)
+
+    # Phase 1: compute all callable metrics
+    for name in callable_names:
+        metric_def = registry.get(name)
+        assert metric_def is not None  # guaranteed above
+        assert callable(metric_def.compute_fn)
+        computed[name] = metric_def.compute_fn(results)
+
+    # Phase 2: resolve composite metrics
+    for name in composite_names:
+        metric_def = registry.get(name)
+        assert metric_def is not None
+        assert isinstance(metric_def.compute_fn, str)
+
+        # Build a namespace of already-computed values. If the expression
+        # references a metric that hasn't been computed yet, compute it now.
+        ns: dict[str, float] = {}
+        for existing_name, existing_val in computed.items():
+            ns[existing_name] = existing_val
+
+        # Evaluate the expression. We deliberately restrict the namespace to
+        # only contain computed metric values (no builtins).
+        try:
+            value = float(eval(metric_def.compute_fn, {"__builtins__": {}}, ns))  # noqa: S307
+        except ZeroDivisionError:
+            value = 0.0
+        except Exception as exc:
+            raise ValueError(
+                f"Failed to evaluate composite metric '{name}' "
+                f"expression '{metric_def.compute_fn}': {exc}"
+            ) from exc
+
+        computed[name] = value
+
+    return computed
+
+
+def validate_metric(metric_def: dict[str, Any]) -> bool:
+    """Validate a metric definition dictionary.
+
+    Checks that the definition contains all required fields with correct types
+    and valid values.
+
+    Required keys:
+    - ``name`` (str, non-empty)
+    - ``description`` (str)
+    - ``compute_fn`` (callable or str)
+
+    Optional keys (with defaults):
+    - ``aggregation`` (str, one of mean/sum/min/max/median)
+    - ``higher_is_better`` (bool)
+
+    Args:
+        metric_def: Dictionary representing a metric definition.
+
+    Returns:
+        ``True`` if the definition is valid, ``False`` otherwise.
+    """
+    # Required fields
+    if not isinstance(metric_def.get("name"), str) or not metric_def["name"].strip():
+        return False
+
+    if not isinstance(metric_def.get("description"), str):
+        return False
+
+    compute_fn = metric_def.get("compute_fn")
+    if compute_fn is None:
+        return False
+    if not callable(compute_fn) and not isinstance(compute_fn, str):
+        return False
+    if isinstance(compute_fn, str) and not compute_fn.strip():
+        return False
+
+    # Optional fields
+    aggregation = metric_def.get("aggregation", "mean")
+    if aggregation not in _VALID_AGGREGATIONS:
+        return False
+
+    higher_is_better = metric_def.get("higher_is_better", True)
+    if not isinstance(higher_is_better, bool):
+        return False
+
+    return True
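
A minimal usage sketch of the new metrics framework. The import path is an assumption (this diff does not show the new module's filename), and the example data uses the result keys the module documents (resolved, tokens, cost, error):

    # Hypothetical import path; the diff does not name the new module file.
    from mcpbr.custom_metrics import MetricDefinition, compute_metrics, create_default_registry

    # Toy per-task results with the keys the built-in compute functions expect.
    results = [
        {"resolved": True, "tokens": {"input": 1200, "output": 300}, "cost": 0.02},
        {"resolved": False, "tokens": {"input": 900, "output": 150}, "cost": 0.01, "error": "timeout"},
    ]

    registry = create_default_registry()

    # Composite metric: compute_fn is a string expression over other metric names.
    registry.register(
        MetricDefinition(
            name="cost_efficiency",
            description="Pass rate per dollar of API cost",
            compute_fn="pass_rate / avg_cost",
        )
    )

    # The metrics referenced by the expression must be requested too,
    # so they are computed in phase 1 before the expression is evaluated.
    values = compute_metrics(results, ["pass_rate", "avg_cost", "cost_efficiency"], registry)
    # -> {"pass_rate": 0.5, "avg_cost": 0.015, "cost_efficiency": 33.33...}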
@@ -0,0 +1,222 @@
+"""Dataset versioning for reproducible benchmark evaluations.
+
+This module provides utilities to pin and track HuggingFace dataset versions,
+ensuring that benchmark runs can be reproduced with the exact same data.
+Version information includes dataset revision hashes, download timestamps,
+and optional checksums for data integrity verification.
+"""
+
+import hashlib
+import json
+import logging
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from datasets import Dataset, load_dataset
+from huggingface_hub import dataset_info
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatasetVersion:
+    """Pinned version information for a HuggingFace dataset.
+
+    Attributes:
+        dataset_id: HuggingFace dataset identifier (e.g., 'SWE-bench/SWE-bench_Lite').
+        revision: Git revision hash of the dataset (None for latest).
+        download_date: ISO 8601 timestamp of when the version was pinned.
+        checksum: Optional SHA256 checksum of the dataset content for integrity verification.
+    """
+
+    dataset_id: str
+    revision: str | None
+    download_date: str
+    checksum: str | None
+
+
+def pin_dataset_version(
+    dataset_id: str,
+    revision: str | None = None,
+) -> DatasetVersion:
+    """Record the current version of a HuggingFace dataset.
+
+    Fetches dataset metadata from the HuggingFace Hub to determine the
+    current revision. If a specific revision is provided, it is used directly.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier (e.g., 'SWE-bench/SWE-bench_Lite').
+        revision: Specific git revision to pin. If None, the latest revision is fetched.
+
+    Returns:
+        DatasetVersion with the pinned revision and metadata.
+
+    Raises:
+        Exception: If the dataset cannot be found or accessed on the HuggingFace Hub.
+    """
+    info = dataset_info(dataset_id, revision=revision)
+    resolved_revision = info.sha
+
+    # Compute a checksum from the dataset card and file metadata for integrity
+    checksum_data = f"{dataset_id}:{resolved_revision}"
+    if info.siblings:
+        file_names = sorted(s.rfilename for s in info.siblings)
+        checksum_data += ":" + ",".join(file_names)
+    checksum = hashlib.sha256(checksum_data.encode()).hexdigest()
+
+    download_date = datetime.now(timezone.utc).isoformat()
+
+    version = DatasetVersion(
+        dataset_id=dataset_id,
+        revision=resolved_revision,
+        download_date=download_date,
+        checksum=checksum,
+    )
+
+    logger.info(
+        "Pinned dataset %s at revision %s",
+        dataset_id,
+        resolved_revision,
+    )
+
+    return version
+
+
+def load_dataset_pinned(
+    dataset_id: str,
+    version: DatasetVersion | None = None,
+    **kwargs: Any,
+) -> Dataset:
+    """Load a HuggingFace dataset using a pinned version for reproducibility.
+
+    Wraps the standard ``datasets.load_dataset`` call, injecting the pinned
+    revision so that the exact same data snapshot is used across runs.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier.
+        version: Pinned version to use. If None, loads the latest version.
+        **kwargs: Additional keyword arguments passed to ``datasets.load_dataset``
+            (e.g., split, name, streaming).
+
+    Returns:
+        The loaded HuggingFace Dataset.
+    """
+    revision = None
+    if version is not None:
+        revision = version.revision
+        logger.info(
+            "Loading dataset %s at pinned revision %s (pinned on %s)",
+            dataset_id,
+            revision,
+            version.download_date,
+        )
+    else:
+        logger.info("Loading dataset %s at latest revision", dataset_id)
+
+    return load_dataset(dataset_id, revision=revision, **kwargs)
+
+
+def save_version_manifest(
+    versions: dict[str, DatasetVersion],
+    path: Path,
+) -> None:
+    """Save dataset version pins to a JSON manifest file.
+
+    The manifest file records all pinned dataset versions so they can be
+    shared across team members or CI environments for reproducible runs.
+
+    Args:
+        versions: Mapping of dataset identifiers to their pinned versions.
+        path: File path to write the JSON manifest.
+    """
+    manifest: dict[str, Any] = {
+        "format_version": "1.0",
+        "created_at": datetime.now(timezone.utc).isoformat(),
+        "datasets": {},
+    }
+
+    for dataset_id, version in versions.items():
+        manifest["datasets"][dataset_id] = asdict(version)
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(path, "w") as f:
+        json.dump(manifest, f, indent=2)
+
+    logger.info("Saved version manifest with %d datasets to %s", len(versions), path)
+
+
+def load_version_manifest(path: Path) -> dict[str, DatasetVersion]:
+    """Load pinned dataset versions from a JSON manifest file.
+
+    Args:
+        path: File path to the JSON manifest.
+
+    Returns:
+        Mapping of dataset identifiers to their pinned versions.
+
+    Raises:
+        FileNotFoundError: If the manifest file does not exist.
+        json.JSONDecodeError: If the manifest file contains invalid JSON.
+        KeyError: If the manifest is missing required fields.
+    """
+    with open(path) as f:
+        manifest = json.load(f)
+
+    versions: dict[str, DatasetVersion] = {}
+    datasets_data = manifest.get("datasets", {})
+
+    for dataset_id, version_data in datasets_data.items():
+        versions[dataset_id] = DatasetVersion(
+            dataset_id=version_data["dataset_id"],
+            revision=version_data.get("revision"),
+            download_date=version_data["download_date"],
+            checksum=version_data.get("checksum"),
+        )
+
+    logger.info("Loaded version manifest with %d datasets from %s", len(versions), path)
+
+    return versions
+
+
+def get_dataset_info(dataset_id: str) -> dict[str, Any]:
+    """Get metadata about a HuggingFace dataset.
+
+    Retrieves information such as the latest revision, description,
+    file listing, and other Hub metadata.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier.
+
+    Returns:
+        Dictionary containing dataset metadata with keys:
+        - dataset_id: The dataset identifier.
+        - latest_revision: The current HEAD revision hash.
+        - description: Dataset description text.
+        - tags: List of dataset tags.
+        - downloads: Number of downloads.
+        - last_modified: Last modification timestamp.
+        - files: List of files in the dataset repository.
+
+    Raises:
+        Exception: If the dataset cannot be found or accessed on the HuggingFace Hub.
+    """
+    info = dataset_info(dataset_id)
+
+    files: list[str] = []
+    if info.siblings:
+        files = [s.rfilename for s in info.siblings]
+
+    result: dict[str, Any] = {
+        "dataset_id": dataset_id,
+        "latest_revision": info.sha,
+        "description": info.description or "",
+        "tags": list(info.tags) if info.tags else [],
+        "downloads": info.downloads if info.downloads is not None else 0,
+        "last_modified": info.last_modified.isoformat() if info.last_modified else None,
+        "files": files,
+    }
+
+    return result
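
A minimal usage sketch of the dataset versioning helpers, again with an assumed import path (the diff does not show the new module's filename):

    # Hypothetical import path; the diff does not name the new module file.
    from pathlib import Path

    from mcpbr.dataset_versioning import (
        load_dataset_pinned,
        load_version_manifest,
        pin_dataset_version,
        save_version_manifest,
    )

    # Pin the current revision of a benchmark dataset and record it in a manifest.
    version = pin_dataset_version("SWE-bench/SWE-bench_Lite")
    save_version_manifest({"SWE-bench/SWE-bench_Lite": version}, Path("dataset_versions.json"))

    # Later, or on another machine, reload the manifest and fetch the same snapshot.
    pinned = load_version_manifest(Path("dataset_versions.json"))
    ds = load_dataset_pinned(
        "SWE-bench/SWE-bench_Lite",
        version=pinned["SWE-bench/SWE-bench_Lite"],
        split="test",
    )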
mcpbr/docker_env.py CHANGED
@@ -724,6 +724,12 @@ CMD ["/bin/bash"]
         if self in _active_managers:
             _active_managers.remove(self)
 
+        # Close the Docker client to release background threads/connections
+        try:
+            self.client.close()
+        except Exception:
+            pass
+
         if report and cleanup_report.total_removed > 0:
             logger.info(str(cleanup_report))