PyPI - eval-protocol - Versions diffs - 0.2.8__tar.gz → 0.2.9__tar.gz - Mend

eval-protocol 0.2.8__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (340) hide show

{eval_protocol-0.2.8/eval_protocol.egg-info → eval_protocol-0.2.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.8
+Version: 0.2.9
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT

{eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2025-08-11T22:02:14-0700",
+ "date": "2025-08-12T13:33:17-0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
- "version": "0.2.8"
+ "full-revisionid": "6b018d4d211d239896a5bda83b375b9bbb4fca34",
+ "version": "0.2.9"
 }
 '''  # END VERSION_JSON

eval_protocol-0.2.9/eval_protocol/benchmarks/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+from .registry import export_benchmark, get_benchmark_runner, list_benchmarks
+__all__ = [
+    "export_benchmark",
+    "get_benchmark_runner",
+    "list_benchmarks",
+]

eval_protocol-0.2.9/eval_protocol/benchmarks/registry.py ADDED Viewed

@@ -0,0 +1,174 @@
+"""
+Benchmark registry and export decorator.
+This module provides a lightweight registry for benchmarks and a decorator
+`@export_benchmark(name)` that can be stacked with `@evaluation_test`.
+It registers a runnable handle that executes the exact same evaluation pipeline
+as the pytest flow by calling `run_evaluation_test_direct` with the parameters
+captured from the decorated function.
+Usage in a suite module (stack under @evaluation_test):
+    from eval_protocol.benchmarks.registry import export_benchmark
+    @export_benchmark("aime25_low")
+    @evaluation_test(...)
+    def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
+        ...
+Programmatic run:
+    from eval_protocol.benchmarks.registry import get_benchmark_runner
+    get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
+"""
+from __future__ import annotations
+import json
+import os
+from typing import Any, Callable, Dict, List, Optional
+# Global registry: name -> callable runner
+_BENCHMARK_REGISTRY: Dict[str, Callable[..., Any]] = {}
+def list_benchmarks() -> List[str]:
+    """Return the names of all registered benchmarks in sorted order."""
+    return sorted(_BENCHMARK_REGISTRY)
+def get_benchmark_runner(name: str) -> Callable[..., Any]:
+    """Return the runner registered under ``name``.
+    Raises KeyError (chained from the failed lookup) whose message lists the
+    currently registered benchmark names when ``name`` is unknown.
+    """
+    try:
+        runner = _BENCHMARK_REGISTRY[name]
+    except KeyError as missing:
+        known = list_benchmarks()
+        raise KeyError(f"Benchmark '{name}' not found. Available: {known}") from missing
+    return runner
+def export_benchmark(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Decorator to export a benchmark test into the global registry.
+    This expects to be stacked with `@evaluation_test`, so the decorated function
+    should carry `__ep_config` and `__ep_original_test_func` attributes that the
+    decorator can read to construct a direct runner. When those attributes are
+    missing, an empty config and the decorated function itself are used.
+    The registered runner supports a subset of convenient overrides and maps them
+    to the same EP_* environment variables used by the pytest plugin to ensure
+    identical summaries and JSON artifact behavior. The variables are written to
+    `os.environ` and are not restored after the run.
+    Registering under a name that already exists replaces the earlier runner.
+    The decorated function itself is returned unchanged.
+    """
+    def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]:
+        """Merge `over` into `base` in place, recursing only where both sides hold dicts."""
+        for k, v in over.items():
+            if isinstance(v, dict) and isinstance(base.get(k), dict):
+                _deep_update(base[k], v)
+            else:
+                base[k] = v
+        return base
+    def _decorator(test_wrapper: Callable[..., Any]) -> Callable[..., Any]:
+        # Pull through metadata attached by evaluation_test
+        ep_config: Dict[str, Any] = getattr(test_wrapper, "__ep_config", {})
+        original_test_func: Optional[Callable[..., Any]] = getattr(
+            test_wrapper, "__ep_original_test_func", None
+        )
+        def _runner(
+            *,
+            model: Optional[str] = None,
+            print_summary: bool = False,
+            out: Optional[str] = None,
+            reasoning_effort: Optional[str] = None,
+            max_rows: Optional[int | str] = None,
+            num_runs: Optional[int] = None,
+            input_params_override: Optional[Dict[str, Any]] = None,
+            max_concurrency: Optional[int] = None,
+        ) -> Any:
+            """Run the exported benchmark through `run_evaluation_test_direct`.
+            Args:
+                model: Model to evaluate; defaults to the first model captured in the config.
+                print_summary: When true, sets EP_PRINT_SUMMARY=1.
+                out: When set, stored in EP_SUMMARY_JSON.
+                reasoning_effort: When set, stored as extra_body.reasoning_effort in EP_INPUT_PARAMS_JSON.
+                max_rows: Integer or "all"; stored in EP_MAX_DATASET_ROWS ("all" becomes "None").
+                num_runs: Overrides the captured num_runs when not None.
+                input_params_override: Deep-merged over the reasoning-effort entry in EP_INPUT_PARAMS_JSON.
+                max_concurrency: Overrides the captured max_concurrent_rollouts when not None.
+            Returns:
+                Whatever `run_evaluation_test_direct` returns.
+            Raises:
+                ValueError: If no model is given and none was captured from evaluation_test.
+            """
+            # Map convenience flags to EP_* env used by the pytest flow
+            if print_summary:
+                os.environ["EP_PRINT_SUMMARY"] = "1"
+            if out:
+                os.environ["EP_SUMMARY_JSON"] = out
+            # Merge reasoning effort and arbitrary overrides into EP_INPUT_PARAMS_JSON
+            merged: Dict[str, Any] = {}
+            if reasoning_effort:
+                # Fireworks OpenAI-compatible endpoint expects extra_body.reasoning_effort, not nested reasoning dict
+                merged.setdefault("extra_body", {})["reasoning_effort"] = str(reasoning_effort)
+            if input_params_override:
+                # Shallow-copy the top level so the caller's dict is not mutated by the merge
+                merged = _deep_update(merged, dict(input_params_override))
+            if merged:
+                os.environ["EP_INPUT_PARAMS_JSON"] = json.dumps(merged)
+            if max_rows is not None:
+                if isinstance(max_rows, str) and max_rows.strip().lower() == "all":
+                    os.environ["EP_MAX_DATASET_ROWS"] = "None"
+                else:
+                    os.environ["EP_MAX_DATASET_ROWS"] = str(max_rows)
+            # Build effective parameters, preferring overrides
+            models: List[str] = ep_config.get("model") or []
+            model_to_use = model or (models[0] if models else None)
+            if not model_to_use:
+                raise ValueError(
+                    f"No model provided and none captured from evaluation_test for benchmark '{name}'"
+                )
+            max_concurrent_rollouts = ep_config.get("max_concurrent_rollouts")
+            if max_concurrency is not None:
+                max_concurrent_rollouts = int(max_concurrency)
+            # Choose the first rollout param set by default
+            rollout_input_params_list = ep_config.get("rollout_input_params")
+            rollout_params = None
+            if isinstance(rollout_input_params_list, list) and rollout_input_params_list:
+                rollout_params = rollout_input_params_list[0]
+            # Import runner lazily to avoid hard import dependencies and circulars
+            import importlib
+            _mod = importlib.import_module("eval_protocol.pytest.evaluation_test")
+            return _mod.run_evaluation_test_direct(
+                test_func=original_test_func or test_wrapper,
+                model=model_to_use,
+                input_messages=ep_config.get("input_messages"),
+                input_dataset=ep_config.get("input_dataset"),
+                dataset_adapter=ep_config.get("dataset_adapter"),
+                rollout_input_params=rollout_params,
+                rollout_processor=ep_config.get("rollout_processor"),
+                aggregation_method=ep_config.get("aggregation_method"),
+                threshold_of_success=ep_config.get("threshold_of_success"),
+                num_runs=(num_runs if num_runs is not None else ep_config.get("num_runs")),
+                max_dataset_rows=ep_config.get("max_dataset_rows"),
+                mcp_config_path=ep_config.get("mcp_config_path"),
+                max_concurrent_rollouts=max_concurrent_rollouts,
+                server_script_path=ep_config.get("server_script_path"),
+                steps=ep_config.get("steps"),
+                mode=ep_config.get("mode"),
+            )
+        # Register runner; the latest definition for a name wins
+        _BENCHMARK_REGISTRY[name] = _runner
+        return test_wrapper
+    return _decorator

eval_protocol-0.2.9/eval_protocol/benchmarks/run.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""
+Minimal CLI runner for exported benchmarks.
+Usage:
+  python -m eval_protocol.benchmarks.run aime25_low \
+    --model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
+    --print-summary \
+    --out artifacts/aime25_low.json \
+    --max-rows 50 \
+    --reasoning-effort low
+"""
+from __future__ import annotations
+import argparse
+from typing import Any
+from importlib import import_module
+import pkgutil
+import eval_protocol.benchmarks.suites as suites_pkg
+from eval_protocol.benchmarks.registry import get_benchmark_runner, list_benchmarks
+def _parse_args() -> argparse.Namespace:
+    """Parse the benchmark runner's command-line arguments.
+    The benchmark names shown in the positional argument's help are whatever is
+    registered at the moment this function runs.
+    """
+    parser = argparse.ArgumentParser(description="Run an exported eval-protocol benchmark")
+    parser.add_argument("name", help=f"Benchmark name. Known: {', '.join(list_benchmarks()) or '(none)'}")
+    parser.add_argument("--model", required=True, help="Model identifier (provider/model)")
+    parser.add_argument("--print-summary", action="store_true", help="Print concise EP summary line")
+    parser.add_argument("--out", help="Write JSON summary artifact to path or directory")
+    parser.add_argument(
+        "--reasoning-effort",
+        choices=["low", "medium", "high"],
+        help="Sets extra_body.reasoning_effort via EP_INPUT_PARAMS_JSON",
+    )
+    parser.add_argument(
+        "--max-rows",
+        help="Limit rows: integer or 'all' for no limit (maps to EP_MAX_DATASET_ROWS)",
+    )
+    parser.add_argument("--num-runs", type=int, help="Override num_runs if provided")
+    parser.add_argument("--max-tokens", type=int, help="Override max_tokens for generation requests")
+    parser.add_argument("--max-concurrency", type=int, help="Override max concurrent rollouts")
+    return parser.parse_args()
+def main() -> int:
+    """Run the benchmark named on the command line and return the process exit code.
+    Imports every module in the suites package so their @export_benchmark
+    decorators register, then looks up the requested runner and invokes it with
+    the parsed overrides. Import failures of individual suites are reported on
+    stderr and do not abort the run. An unknown benchmark name raises KeyError
+    from `get_benchmark_runner`.
+    """
+    import sys
+    import traceback
+    args = _parse_args()
+    # Import all suite modules so their @export_benchmark decorators register
+    for modinfo in pkgutil.iter_modules(suites_pkg.__path__):
+        mod_name = f"{suites_pkg.__name__}.{modinfo.name}"
+        try:
+            import_module(mod_name)
+        except Exception as e:
+            # Best effort: one broken suite must not prevent running the others
+            print(f"[bench] failed to import suite module: {mod_name}: {e}", file=sys.stderr)
+            traceback.print_exc()
+    # Fallback: if nothing registered yet and a known suite was requested, try explicit import
+    if not list_benchmarks():
+        known_map = {
+            "aime25_low": "eval_protocol.benchmarks.suites.aime25",
+            "aime25": "eval_protocol.benchmarks.suites.aime25",
+            "gpqa": "eval_protocol.benchmarks.suites.gpqa",
+        }
+        forced = known_map.get(args.name)
+        if forced:
+            try:
+                import_module(forced)
+            except Exception as e:
+                print(f"[bench] explicit import failed for {forced}: {e}", file=sys.stderr)
+    runner = get_benchmark_runner(args.name)
+    # --max-rows is either an integer or a keyword such as 'all'; pass non-integers through as text
+    max_rows: int | str | None = None
+    if args.max_rows is not None:
+        try:
+            max_rows = int(args.max_rows)
+        except ValueError:
+            max_rows = str(args.max_rows)
+    # Build input params override if needed
+    ip_override: dict[str, Any] = {}
+    if args.max_tokens is not None:
+        ip_override["max_tokens"] = int(args.max_tokens)
+    runner(
+        model=args.model,
+        print_summary=args.print_summary,
+        out=args.out,
+        reasoning_effort=args.reasoning_effort,
+        max_rows=max_rows,
+        num_runs=args.num_runs,
+        input_params_override=(ip_override or None),
+        max_concurrency=args.max_concurrency,
+    )
+    # Non-zero exit on failure gate is handled within the runner via assertions
+    return 0
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

eval_protocol-0.2.9/eval_protocol/benchmarks/suites/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@
1	+ # Suite modules are auto-imported by eval_protocol.benchmarks.run to register benchmarks.
2	+
3	+

eval_protocol-0.2.9/eval_protocol/benchmarks/suites/aime25.py ADDED Viewed

@@ -0,0 +1,118 @@
+from typing import Any, Dict, List, Optional
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
+from eval_protocol.pytest.default_single_turn_rollout_process import (
+    default_single_turn_rollout_processor,
+)
+from eval_protocol.pytest.evaluation_test import evaluation_test
+from eval_protocol.benchmarks.registry import export_benchmark
+SYSTEM_PROMPT = (
+    "You are a helpful math assistant. Please reason step by step, and put your "
+    "final answer within \\boxed{...}."
+)
+def _extract_boxed_text(text: str) -> str:
+    """Pull the final answer text out of a model reply.
+    Scans `boxed{...}` / `framebox{...}` occurrences from last to first and
+    returns the part after the last comma of the first non-empty capture,
+    stripped. If no such capture exists, returns the last run of digits in the
+    text, or "" when there is none (or the text is empty).
+    """
+    import re
+    if not text:
+        return ""
+    boxed_hits = re.findall(r"boxed{(.*?)}|framebox{(.*?)}", text, re.DOTALL)
+    for groups in reversed(boxed_hits):
+        # Each hit is a (boxed, framebox) pair; only one side is non-empty
+        for captured in groups:
+            if captured:
+                return captured.split(",")[-1].strip()
+    digit_runs = re.findall(r"\d+", text, re.DOTALL)
+    return digit_runs[-1] if digit_runs else ""
+def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
+    """Convert the leading digit run of `s` (after stripping whitespace) to an int.
+    Returns None when `s` is None, does not start with a digit, or the digits
+    cannot be converted.
+    """
+    import re
+    if s is None:
+        return None
+    leading = re.match(r"\d+", str(s).strip())
+    if leading is None:
+        return None
+    try:
+        value = int(leading.group(0))
+    except ValueError:
+        return None
+    return value
+def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
+    """Turn raw dataset records into EvaluationRows.
+    Each record's "question" becomes the user message after the shared system
+    prompt; its "answer", when present, becomes the string ground truth.
+    """
+    def _to_row(record: Dict[str, Any]) -> EvaluationRow:
+        raw_answer = record.get("answer", None)
+        truth = None if raw_answer is None else str(raw_answer)
+        prompt = [
+            Message(role="system", content=SYSTEM_PROMPT),
+            Message(role="user", content=str(record.get("question", ""))),
+        ]
+        return EvaluationRow(messages=prompt, ground_truth=truth)
+    return [_to_row(record) for record in rows]
+@export_benchmark("aime25")
+@evaluation_test(
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    input_dataset=[
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
+        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
+    ],
+    dataset_adapter=aime2025_dataset_adapter,
+    rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}],
+    rollout_processor=default_single_turn_rollout_processor,
+    aggregation_method="mean",
+    threshold_of_success=None,
+    num_runs=8,
+    max_dataset_rows=2,
+    max_concurrent_rollouts=4,
+    mode="pointwise",
+)
+def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
+    """Score one row by exact integer match between the reply and the ground truth.
+    The last assistant message is parsed for a boxed (or trailing) integer and
+    compared with the integer parsed from `row.ground_truth`. The score is only
+    marked valid when both integers could be parsed.
+    """
+    # Content of the last assistant message, or "" when there is none
+    last_reply = ""
+    for msg in row.messages:
+        if msg.role == "assistant":
+            last_reply = msg.content
+    extracted_text = _extract_boxed_text(last_reply or "")
+    extracted_int = _normalize_to_int_or_none(extracted_text)
+    gt_int = _normalize_to_int_or_none(row.ground_truth or "")
+    is_valid = extracted_int is not None and gt_int is not None
+    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
+    if score == 1.0:
+        metric_reason = "Parsed both integers and they matched"
+    elif is_valid:
+        metric_reason = "Parsed integers did not match"
+    else:
+        metric_reason = "Failed to parse integer"
+    exact_match = MetricResult(
+        score=score,
+        is_score_valid=is_valid,
+        reason=metric_reason,
+        data={
+            "extracted_text": extracted_text,
+            "extracted_int": extracted_int,
+            "ground_truth_int": gt_int,
+        },
+    )
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
+        is_score_valid=is_valid,
+        metrics={"exact_match": exact_match},
+    )
+    return row

eval_protocol-0.2.9/eval_protocol/benchmarks/suites/gpqa.py ADDED Viewed

@@ -0,0 +1,100 @@
+from typing import List
+import csv
+import io
+import re
+import requests
+from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
+from eval_protocol.pytest.evaluation_test import evaluation_test
+from eval_protocol.pytest.default_single_turn_rollout_process import (
+    default_single_turn_rollout_processor,
+)
+from eval_protocol.benchmarks.registry import export_benchmark
+SYSTEM_PROMPT = (
+    "You are a helpful assistant. Read the question and options carefully. "
+    "Express your final answer strictly as a single letter: A, B, C, or D."
+)
+def _load_gpqa_messages_from_csv() -> List[List[Message]]:
+    """Download the GPQA diamond CSV and build one message list per row.
+    Each conversation holds the system prompt, a user message with the question
+    and four lettered options, and a trailing system message carrying the
+    ground-truth marker "__GT__:A".
+    Raises:
+        requests.HTTPError: If the download returns an error status.
+        RuntimeError: If the CSV yields no rows.
+    """
+    response = requests.get(
+        "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv",
+        timeout=60,
+    )
+    response.raise_for_status()
+    conversations: List[List[Message]] = []
+    for record in csv.DictReader(io.StringIO(response.text)):
+        question = str(record.get("Question", ""))
+        # The correct answer is placed first, so it is always option A.
+        # NOTE(review): options are never shuffled, so a model that always answers A scores 100% — confirm this is intended.
+        options = [
+            str(record.get("Correct Answer", "")).strip(),
+            str(record.get("Incorrect Answer 1", "")),
+            str(record.get("Incorrect Answer 2", "")),
+            str(record.get("Incorrect Answer 3", "")),
+        ]
+        lettered = "\n".join(f"({letter}) {option}" for letter, option in zip("ABCD", options))
+        user_content = f"{question}\n\n{lettered}\n\nAnswer with one letter."
+        # NOTE(review): the ground-truth marker travels inside the conversation; presumably the
+        # rollout processor forwards it to the model — verify it is filtered out before sending.
+        conversations.append(
+            [
+                Message(role="system", content=SYSTEM_PROMPT),
+                Message(role="user", content=user_content),
+                Message(role="system", content="__GT__:A"),
+            ]
+        )
+    if not conversations:
+        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
+    return conversations
+def _extract_abcd_letter(text: str) -> "str | None":
+    """Return the option letter (A-D) chosen in a model reply, or None.
+    An explicit statement such as "answer is C" or "Answer: (B)" with an
+    uppercase letter wins, and the last such statement is used. Otherwise the
+    first standalone A/B/C/D in the uppercased text is returned. The explicit
+    pass exists because uppercasing turns the English article "a" into a
+    standalone "A", which would otherwise shadow a later stated answer.
+    """
+    if not text:
+        return None
+    # Only the keyword is case-insensitive; the letter must already be uppercase
+    # so that "the answer is a ..." does not count as option A.
+    explicit = re.findall(r"(?i:answer)\s*(?:(?i:is)|:)?\s*\(?([ABCD])\)?(?![A-Za-z])", text)
+    if explicit:
+        return explicit[-1]
+    m = re.search(r"\b([ABCD])\b", text.upper())
+    return m.group(1) if m else None
+# Built once at import time: importing this module runs the HTTP download in _load_gpqa_messages_from_csv().
+_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
+@export_benchmark("gpqa")
+@evaluation_test(
+    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
+    input_messages=_GPQA_INPUT_MESSAGES,
+    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
+    rollout_processor=default_single_turn_rollout_processor,
+    aggregation_method="mean",
+    threshold_of_success=None,
+    num_runs=8,
+    mode="pointwise",
+)
+def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
+    """Score one row by comparing the predicted option letter with the ground truth.
+    The prediction comes from the last assistant message; the ground truth is
+    read from the last system message whose content starts with "__GT__:". The
+    score is only marked valid when a letter was predicted and the ground truth
+    is one of A-D.
+    """
+    last_reply = ""
+    gt = None
+    for msg in row.messages:
+        if msg.role == "assistant":
+            last_reply = msg.content
+        elif msg.role == "system" and (msg.content or "").startswith("__GT__:"):
+            # Later markers override earlier ones
+            gt = msg.content.split(":", 1)[1].strip()
+    pred = _extract_abcd_letter(last_reply or "")
+    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
+    score = 1.0 if (is_valid and pred == gt) else 0.0
+    matched = score == 1.0
+    exact_match = MetricResult(
+        score=score,
+        is_score_valid=is_valid,
+        reason=("Matched" if matched else "Not matched"),
+        data={"pred": pred, "gt": gt},
+    )
+    row.evaluation_result = EvaluateResult(
+        score=score,
+        reason=("Correct option" if matched else "Incorrect option"),
+        is_score_valid=is_valid,
+        metrics={"exact_match": exact_match},
+    )
+    return row

{eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/base_policy.py RENAMED Viewed

@@ -151,7 +151,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
         tool_schemas: List[Dict],
         env_index: int,
         conversation_history: List[Dict[str, Any]],
-    ) -> Tuple[List[MCPToolCall], CompletionUsage]:
+    ) -> Tuple[List[MCPToolCall], CompletionUsage, str]:
         """
         Generate tool calls using conversation history for proper OpenAI trajectories.
@@ -161,7 +161,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
             user_prompt: Current user prompt with observation
         Returns:
-            List of MCPToolCall objects
+            List of MCPToolCall objects, LLM usage stats, and finish reason
         """
         # Convert MCP tools to LLM format
         llm_tools = self._convert_mcp_tools_to_llm_format(tool_schemas)
@@ -190,6 +190,8 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
             total_tokens=response["usage"]["total_tokens"],
         )
+        finish_reason = response["choices"][0]["finish_reason"]
         # Extract tool call from response
         message = response["choices"][0]["message"]
         logger.debug(f"Environment {env_index} - Response message: {message}")
@@ -217,15 +219,19 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
             if self.max_tools_per_turn:
                 mcp_tool_calls = mcp_tool_calls[: self.max_tools_per_turn]
-            return mcp_tool_calls, usage_stats
+            return mcp_tool_calls, usage_stats, finish_reason
         else:
             # No tool calls in response - this is normal when episode ends or LLM provides only text
             logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
-            return [
-                MCPToolCall(
-                    tool_name="_no_tool_call",
-                    arguments={
-                        "reason": "no_tool_call_generated",
-                    },
-                )
-            ], usage_stats
+            return (
+                [
+                    MCPToolCall(
+                        tool_name="_no_tool_call",
+                        arguments={
+                            "reason": "no_tool_call_generated",
+                        },
+                    )
+                ],
+                usage_stats,
+                finish_reason,
+            )

eval-protocol 0.2.8__tar.gz → 0.2.9__tar.gz

eval-protocol 0.2.8__tar.gz → 0.2.9__tar.gz