eval-protocol 0.2.6.dev2__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (334):
  1. {eval_protocol-0.2.6.dev2/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
  4. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
  5. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +3 -4
  6. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +14 -11
  7. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
  8. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
  9. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -4
  10. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +4 -5
  11. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +96 -38
  12. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +8 -2
  13. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
  14. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
  15. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
  16. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
  17. eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
  18. eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
  19. eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
  20. eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
  21. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
  22. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
  23. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
  24. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-D9iVTBbF.css +0 -1
  25. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js +0 -88
  26. eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js.map +0 -1
  27. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/LICENSE +0 -0
  28. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/README.md +0 -0
  29. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/__init__.py +0 -0
  30. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
  31. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
  32. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
  33. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
  34. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
  35. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
  36. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
  37. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
  38. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
  39. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
  40. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
  41. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
  42. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
  43. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
  44. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
  45. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
  46. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
  47. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  48. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  49. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  50. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  51. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  52. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
  53. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  54. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  55. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  56. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  57. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
  58. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
  59. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
  60. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
  61. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
  62. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  63. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
  64. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
  65. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  66. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
  67. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  68. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
  69. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
  70. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
  71. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  72. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  73. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
  74. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
  75. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
  76. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
  77. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
  78. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
  79. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
  80. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  81. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  82. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
  83. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
  84. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
  85. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
  86. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
  87. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
  88. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
  89. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
  90. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
  91. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
  92. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
  93. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
  94. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
  95. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
  96. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
  97. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
  98. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
  99. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
  100. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
  101. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
  102. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
  103. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
  104. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
  105. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
  106. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
  107. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  108. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
  109. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
  110. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
  111. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
  112. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
  113. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
  114. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
  115. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
  116. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  117. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
  118. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  119. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  120. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  121. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  122. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  123. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
  124. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
  125. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
  126. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
  127. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
  128. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
  129. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  130. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  131. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  132. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
  133. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
  134. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
  135. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
  136. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
  137. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
  138. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
  139. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  140. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  141. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
  142. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
  143. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
  144. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
  145. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
  146. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  147. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
  148. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
  149. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
  150. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
  151. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
  152. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
  153. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  154. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
  155. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  156. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
  157. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
  158. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
  159. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
  160. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
  161. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
  162. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
  163. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
  164. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
  165. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
  166. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
  167. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
  168. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
  169. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
  170. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
  171. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
  172. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
  173. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
  174. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
  175. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
  176. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
  177. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/pyproject.toml +0 -0
  178. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.cfg +0 -0
  179. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.py +0 -0
  180. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
  181. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
  182. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
  183. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
  184. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
  185. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
  186. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
  187. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
  188. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
  189. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
  190. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
  191. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
  192. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
  193. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
  194. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
  195. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
  196. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
  197. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
  198. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
  199. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
  200. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
  201. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
  202. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
  203. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
  204. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
  205. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
  206. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
  207. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
  208. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
  209. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
  210. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
  211. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
  212. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
  213. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  214. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
  215. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
  216. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
  217. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
  218. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
  219. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
  220. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
  221. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
  222. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
  223. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
  224. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
  225. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
  226. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
  227. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
  228. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
  229. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
  230. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
  231. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
  232. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
  233. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
  234. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
  235. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
  236. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
  237. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
  238. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
  239. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
  240. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
  241. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
  242. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
  243. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
  244. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
  245. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
  246. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
  247. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
  248. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
  249. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
  250. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
  251. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
  252. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
  253. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
  254. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
  255. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
  256. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
  257. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
  258. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
  259. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
  260. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
  261. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
  262. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
  263. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
  264. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
  265. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
  266. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
  267. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
  268. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
  269. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
  270. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
  271. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
  272. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
  273. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
  274. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
  275. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
  276. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
  277. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
  278. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
  279. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
  280. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
  281. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
  282. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
  283. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  284. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  285. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  286. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  287. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  288. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  289. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  290. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  291. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
  292. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  293. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  294. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
  295. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
  296. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
  297. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
  298. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
  299. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
  300. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
  301. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  302. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
  303. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
  304. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  305. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  306. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  307. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  308. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  309. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
  310. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
  311. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  312. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
  313. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  314. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  315. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
  316. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
  317. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
  318. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
  319. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
  320. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  321. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
  322. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
  323. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
  324. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
  325. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
  326. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
  327. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
  328. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
  329. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
  330. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
  331. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
  332. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/versioneer.py +0 -0
  333. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  334. {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-protocol
3
- Version: 0.2.6.dev2
3
+ Version: 0.2.8
4
4
  Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
5
5
  Author-email: Fireworks AI <info@fireworks.ai>
6
6
  License-Expression: MIT
@@ -8,11 +8,11 @@ import json
8
8
 
9
9
  version_json = '''
10
10
  {
11
- "date": "2025-08-10T19:39:17-0700",
11
+ "date": "2025-08-11T22:02:14-0700",
12
12
  "dirty": false,
13
13
  "error": null,
14
- "full-revisionid": "a50c3f62fdb5be7347741446338d8c1771e92547",
15
- "version": "0.2.6-dev2"
14
+ "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
15
+ "version": "0.2.8"
16
16
  }
17
17
  ''' # END VERSION_JSON
18
18
 
@@ -289,6 +289,7 @@ def parse_args(args=None):
289
289
 
290
290
  # Logs command
291
291
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292
+ logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
292
293
 
293
294
  # Run command (for Hydra-based evaluations)
294
295
  # This subparser intentionally defines no arguments itself.
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
11
11
  def logs_command(args):
12
12
  """Serve logs with file watching and real-time updates"""
13
13
 
14
+ port = args.port
14
15
  print(f"🚀 Starting Eval Protocol Logs Server")
15
- print(f"🌐 URL: http://localhost:8000")
16
- print(f"🔌 WebSocket: ws://localhost:8000/ws")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
17
18
  print(f"👀 Watching paths: {['current directory']}")
18
19
  print("Press Ctrl+C to stop the server")
19
20
  print("-" * 50)
20
21
 
21
22
  try:
22
- serve_logs()
23
+ serve_logs(port=args.port)
23
24
  return 0
24
25
  except KeyboardInterrupt:
25
26
  print("\n🛑 Server stopped by user")
@@ -22,9 +22,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
22
22
  self._store = SqliteEvaluationRowStore(self.db_path)
23
23
 
24
24
  def log(self, row: "EvaluationRow") -> None:
25
- row_id = row.input_metadata.row_id
26
25
  data = row.model_dump(exclude_none=True, mode="json")
27
- self._store.upsert_row(row_id=row_id, data=data)
26
+ self._store.upsert_row(data=data)
28
27
  try:
29
28
  event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
30
29
  except Exception as e:
@@ -32,8 +31,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
32
31
  logger.error(f"Failed to emit row_upserted event: {e}")
33
32
  pass
34
33
 
35
- def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
34
+ def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
36
35
  from eval_protocol.models import EvaluationRow
37
36
 
38
- results = self._store.read_rows(row_id=row_id)
37
+ results = self._store.read_rows(rollout_id=rollout_id)
39
38
  return [EvaluationRow(**data) for data in results]
@@ -11,7 +11,7 @@ class SqliteEvaluationRowStore:
11
11
  """
12
12
  Lightweight reusable SQLite store for evaluation rows.
13
13
 
14
- Stores arbitrary row data as JSON keyed by a unique string `row_id`.
14
+ Stores arbitrary row data as JSON keyed by a unique string `rollout_id`.
15
15
  """
16
16
 
17
17
  def __init__(self, db_path: str):
@@ -24,7 +24,7 @@ class SqliteEvaluationRowStore:
24
24
  database = self._db
25
25
 
26
26
  class EvaluationRow(BaseModel): # type: ignore
27
- row_id = CharField(unique=True)
27
+ rollout_id = CharField(unique=True)
28
28
  data = JSONField()
29
29
 
30
30
  self._EvaluationRow = EvaluationRow
@@ -36,22 +36,25 @@ class SqliteEvaluationRowStore:
36
36
  def db_path(self) -> str:
37
37
  return self._db_path
38
38
 
39
- def upsert_row(self, row_id: str, data: dict) -> None:
40
- if self._EvaluationRow.select().where(self._EvaluationRow.row_id == row_id).exists():
41
- self._EvaluationRow.update(data=data).where(self._EvaluationRow.row_id == row_id).execute()
39
+ def upsert_row(self, data: dict) -> None:
40
+ rollout_id = data["execution_metadata"]["rollout_id"]
41
+ if rollout_id is None:
42
+ raise ValueError("execution_metadata.rollout_id is required to upsert a row")
43
+ if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
44
+ self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
42
45
  else:
43
- self._EvaluationRow.create(row_id=row_id, data=data)
46
+ self._EvaluationRow.create(rollout_id=rollout_id, data=data)
44
47
 
45
- def read_rows(self, row_id: Optional[str] = None) -> List[dict]:
46
- if row_id is None:
48
+ def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
49
+ if rollout_id is None:
47
50
  query = self._EvaluationRow.select().dicts()
48
51
  else:
49
- query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.row_id == row_id)
52
+ query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.rollout_id == rollout_id)
50
53
  results = list(query)
51
54
  return [result["data"] for result in results]
52
55
 
53
- def delete_row(self, row_id: str) -> int:
54
- return self._EvaluationRow.delete().where(self._EvaluationRow.row_id == row_id).execute()
56
+ def delete_row(self, rollout_id: str) -> int:
57
+ return self._EvaluationRow.delete().where(self._EvaluationRow.rollout_id == rollout_id).execute()
55
58
 
56
59
  def delete_all_rows(self) -> int:
57
60
  return self._EvaluationRow.delete().execute()
@@ -158,8 +158,8 @@ class ExecutionManager:
158
158
  messages.append(Message.model_validate(msg_dict))
159
159
 
160
160
  evaluation_rows[idx].messages = messages
161
- evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
- evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
161
+ # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
+ # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
163
163
  evaluation_rows[idx].tools = shared_tool_schema
164
164
  evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
165
165
  evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ class ExecutionManager:
482
482
  trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
483
483
  try:
484
484
  await envs.connection_manager.reset_session(session)
485
- except:
485
+ except: # noqa: E722
486
486
  logger.error(f"Error resetting session {session.session_id}")
487
487
  try:
488
488
  await envs.connection_manager.close_session(session)
489
- except:
489
+ except: # noqa: E722
490
490
  logger.error(f"Error closing session {session.session_id}")
491
491
  return trajectory
492
492
 
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
202
202
  )
203
203
 
204
204
 
205
+ class EvaluationThreshold(BaseModel):
206
+ """Threshold configuration for evaluation tests.
207
+
208
+ The success field is required - tests must specify a minimum success rate.
209
+ The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
210
+ """
211
+
212
+ success: float = Field(
213
+ ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
214
+ )
215
+ standard_deviation: Optional[float] = Field(
216
+ None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
217
+ )
218
+
219
+
205
220
  class EvalMetadata(BaseModel):
206
221
  """Metadata about the evaluation that was run."""
207
222
 
@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
216
231
  )
217
232
  num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218
233
  aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219
- threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
234
+ passed_threshold: Optional[EvaluationThreshold] = Field(
235
+ None, description="Threshold configuration for test success"
236
+ )
220
237
  passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
221
238
 
222
239
 
240
+ class ExecutionMetadata(BaseModel):
241
+ """Metadata about the execution of the evaluation."""
242
+
243
+ invocation_id: Optional[str] = Field(
244
+ default_factory=generate_id,
245
+ description="The ID of the invocation that this row belongs to.",
246
+ )
247
+
248
+ experiment_id: Optional[str] = Field(
249
+ default_factory=generate_id,
250
+ description="The ID of the experiment that this row belongs to.",
251
+ )
252
+
253
+ rollout_id: Optional[str] = Field(
254
+ default_factory=generate_id,
255
+ description="The ID of the rollout that this row belongs to.",
256
+ )
257
+
258
+ run_id: Optional[str] = Field(
259
+ None,
260
+ description=("The ID of the run that this row belongs to."),
261
+ )
262
+
263
+
223
264
  class RolloutStatus(BaseModel):
224
265
  """Status of the rollout."""
225
266
 
@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
264
305
  description="The status of the rollout.",
265
306
  )
266
307
 
267
- invocation_id: Optional[str] = Field(
268
- default_factory=generate_id,
269
- description="The ID of the invocation that this row belongs to.",
270
- )
271
-
272
- cohort_id: Optional[str] = Field(
273
- default_factory=generate_id,
274
- description="The ID of the cohort that this row belongs to.",
275
- )
276
-
277
- rollout_id: Optional[str] = Field(
278
- default_factory=generate_id,
279
- description="The ID of the rollout that this row belongs to.",
280
- )
281
-
282
- run_id: Optional[str] = Field(
283
- None,
284
- description=("The ID of the run that this row belongs to."),
285
- )
286
-
287
308
  # Ground truth reference (moved from EvaluateResult to top level)
288
309
  ground_truth: Optional[str] = Field(
289
310
  default=None, description="Optional ground truth reference for this evaluation."
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
294
315
  default=None, description="The evaluation result for this row/trajectory."
295
316
  )
296
317
 
318
+ execution_metadata: ExecutionMetadata = Field(
319
+ default_factory=ExecutionMetadata,
320
+ description="Metadata about the execution of the evaluation.",
321
+ )
322
+
297
323
  # LLM usage statistics
298
324
  usage: Optional[CompletionUsage] = Field(
299
325
  default=None, description="Token usage statistics from LLM calls during execution."
@@ -8,7 +8,7 @@ from openai import NOT_GIVEN, NotGiven
8
8
  from openai.types.chat import ChatCompletionContentPartTextParam, ChatCompletionMessage, ChatCompletionToolParam
9
9
  from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
10
10
 
11
- from eval_protocol.dataset_logger import default_logger
11
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
12
12
  from eval_protocol.mcp.execution.policy import LiteLLMPolicy
13
13
  from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
14
14
  from eval_protocol.models import EvaluationRow, Message
@@ -20,12 +20,13 @@ class Agent:
20
20
  A really simple agent that calls the model until no more tool calls are needed.
21
21
  """
22
22
 
23
- def __init__(self, model: str, row: EvaluationRow, config_path: str):
23
+ def __init__(self, model: str, row: EvaluationRow, config_path: str, logger: DatasetLogger):
24
24
  self.model = model
25
25
  self.evaluation_row: EvaluationRow = row
26
26
  self._policy = LiteLLMPolicy(model_id=model)
27
27
  self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
28
28
  self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
29
+ self.logger: DatasetLogger = logger
29
30
 
30
31
  async def setup(self):
31
32
  if self.mcp_client:
@@ -42,7 +43,7 @@ class Agent:
42
43
 
43
44
  def append_message_and_log(self, message: Message):
44
45
  self.messages.append(message)
45
- default_logger.log(self.evaluation_row)
46
+ self.logger.log(self.evaluation_row)
46
47
 
47
48
  async def call_agent(self) -> str:
48
49
  """
@@ -116,7 +117,7 @@ async def default_agent_rollout_processor(
116
117
  ) -> List[EvaluationRow]:
117
118
  dataset: Dataset = []
118
119
  for row in rows:
119
- agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path)
120
+ agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
120
121
  await agent.setup()
121
122
  await agent.call_agent()
122
123
  dataset.append(agent.evaluation_row)
@@ -1,11 +1,9 @@
1
1
  import asyncio
2
- from typing import List
3
-
4
2
  import logging
5
3
  import os
4
+ from typing import List
6
5
 
7
- from eval_protocol.dataset_logger import default_logger
8
- from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
6
+ from eval_protocol.models import ChatCompletionMessageToolCall, EvaluationRow, Message
9
7
  from eval_protocol.pytest.types import RolloutProcessorConfig
10
8
 
11
9
 
@@ -49,6 +47,7 @@ async def default_single_turn_rollout_processor(
49
47
 
50
48
  # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
51
49
  import importlib
50
+
52
51
  _litellm = importlib.import_module("litellm")
53
52
  acompletion = getattr(_litellm, "acompletion")
54
53
  response = await acompletion(**request_params)
@@ -79,7 +78,7 @@ async def default_single_turn_rollout_processor(
79
78
  ]
80
79
 
81
80
  row.messages = messages
82
- default_logger.log(row)
81
+ config.logger.log(row)
83
82
  return row
84
83
 
85
84
  # Process rows with bounded concurrency if configured
@@ -3,13 +3,21 @@ import inspect
3
3
  import math
4
4
  import os
5
5
  import statistics
6
- from typing import Any, Callable, Dict, List, Literal, Optional
6
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
7
7
 
8
8
  import pytest
9
9
 
10
10
  from eval_protocol.dataset_logger import default_logger
11
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
11
12
  from eval_protocol.human_id import generate_id
12
- from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message
13
+ from eval_protocol.models import (
14
+ CompletionParams,
15
+ EvalMetadata,
16
+ EvaluationRow,
17
+ EvaluationThreshold,
18
+ InputMetadata,
19
+ Message,
20
+ )
13
21
  from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
14
22
  from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
15
23
  from eval_protocol.pytest.types import (
@@ -46,7 +54,7 @@ def evaluation_test( # noqa: C901
46
54
  rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
47
55
  evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
48
56
  aggregation_method: AggregationMethod = "mean",
49
- threshold_of_success: Optional[float] = None,
57
+ passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
50
58
  num_runs: int = 1,
51
59
  max_dataset_rows: Optional[int] = None,
52
60
  mcp_config_path: Optional[str] = None,
@@ -55,6 +63,7 @@ def evaluation_test( # noqa: C901
55
63
  steps: int = 30,
56
64
  mode: EvaluationTestMode = "batch",
57
65
  combine_datasets: bool = True,
66
+ logger: Optional[DatasetLogger] = None,
58
67
  ) -> Callable[
59
68
  [TestFunction],
60
69
  TestFunction,
@@ -64,14 +73,14 @@ def evaluation_test( # noqa: C901
64
73
  Here are some key concepts to understand the terminology in EP:
65
74
 
66
75
  - "invocation" is a single execution of a test function. An invocation can
67
- generate 1 or more cohorts. Grouping by invocation might be useful to
76
+ generate 1 or more experiments. Grouping by invocation might be useful to
68
77
  aggregate eval scores across multiple invocations when you want to aggregate
69
78
  scores across multiple datasets.
70
- - "cohort" is a group of runs with for a combination of parameters. A single
71
- cohort will have multiple runs if num_runs > 1.
79
+ - "experiment" is a group of runs with for a combination of parameters. A single
80
+ experiment will have multiple runs if num_runs > 1.
72
81
  1. If your evaluation_test has combinations of parameters, it will generate
73
- multiple cohorts per combination of parameters.
74
- 2. A new execution of a test function will generate a new cohort.
82
+ multiple experiments per combination of parameters.
83
+ 2. A new execution of a test function will generate a new experiment.
75
84
  - "run" is a group of rollouts. For multiple num_runs > 1, there will be
76
85
  multiple "run_id"s.
77
86
  - "rollout" is the execution/process that produces a "trajectory". You
@@ -89,7 +98,7 @@ def evaluation_test( # noqa: C901
89
98
  decorated test. It simply produces a score from 0 to 1 and attached it
90
99
  to the row as the "evaluation_result" field.
91
100
 
92
- "invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101
+ "invocation", "experiment", "run", "rollout", and "row" each have a unique ID
93
102
  which can be used to easily group and identify your dataset by.
94
103
 
95
104
  Args:
@@ -106,8 +115,8 @@ def evaluation_test( # noqa: C901
106
115
  rollout_processor: Function used to perform the rollout.
107
116
  evaluation_test_kwargs: Kwargs for the evaluation function.
108
117
  aggregation_method: How to aggregate scores across rows.
109
- threshold_of_success: If set, fail the test if the aggregated score is
110
- below this threshold.
118
+ passed_threshold: Threshold configuration for test success.
119
+ Success rate must be above success, and if set, standard deviation must be below standard_deviation.
111
120
  num_runs: Number of times to repeat the rollout and evaluations.
112
121
  max_dataset_rows: Limit dataset to the first N rows.
113
122
  mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -117,11 +126,22 @@ def evaluation_test( # noqa: C901
117
126
  mode: Evaluation mode. "batch" (default) expects test function to handle
118
127
  full dataset. "pointwise" applies test function to each row. If your evaluation requires
119
128
  the full rollout of all rows to compute the score, use
129
+ logger: DatasetLogger to use for logging. If not provided, a default logger will be used.
120
130
  """
121
131
 
132
+ active_logger: DatasetLogger = logger if logger else default_logger
133
+
122
134
  def decorator(
123
135
  test_func: TestFunction,
124
136
  ):
137
+ if passed_threshold is not None:
138
+ if isinstance(passed_threshold, float):
139
+ threshold = EvaluationThreshold(success=passed_threshold)
140
+ else:
141
+ threshold = EvaluationThreshold(**passed_threshold)
142
+ else:
143
+ threshold = None
144
+
125
145
  sig = inspect.signature(test_func)
126
146
 
127
147
  # For pointwise/rowwise mode, we expect a different signature
@@ -280,14 +300,14 @@ def evaluation_test( # noqa: C901
280
300
  def wrapper_body(**kwargs):
281
301
  model_name = kwargs["model"]
282
302
  eval_metadata = None
283
- all_results: List[EvaluationRow] = []
303
+ all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
284
304
 
285
- cohort_id = generate_id()
305
+ experiment_id = generate_id()
286
306
 
287
307
  def _log_eval_error(
288
308
  status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
289
309
  ) -> None:
290
- log_eval_status_and_rows(eval_metadata, rows, status, passed, default_logger)
310
+ log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)
291
311
 
292
312
  try:
293
313
  # Handle dataset loading
@@ -341,7 +361,7 @@ def evaluation_test( # noqa: C901
341
361
  status="running",
342
362
  num_runs=num_runs,
343
363
  aggregation_method=aggregation_method,
344
- threshold_of_success=threshold_of_success,
364
+ passed_threshold=threshold,
345
365
  passed=None,
346
366
  )
347
367
 
@@ -363,13 +383,12 @@ def evaluation_test( # noqa: C901
363
383
  row.input_metadata.session_data["mode"] = mode
364
384
  # Initialize eval_metadata for each row
365
385
  row.eval_metadata = eval_metadata
366
- row.cohort_id = cohort_id
367
- row.invocation_id = invocation_id
386
+ row.execution_metadata.experiment_id = experiment_id
387
+ row.execution_metadata.invocation_id = invocation_id
368
388
 
369
389
  # has to be done in the pytest main process since it's
370
390
  # used to determine whether this eval has stopped
371
391
  row.pid = os.getpid()
372
- default_logger.log(row)
373
392
 
374
393
  # Prepare rollout processor config once; we will generate fresh outputs per run
375
394
  config = RolloutProcessorConfig(
@@ -379,21 +398,26 @@ def evaluation_test( # noqa: C901
379
398
  max_concurrent_rollouts=max_concurrent_rollouts,
380
399
  server_script_path=server_script_path,
381
400
  steps=steps,
401
+ logger=active_logger,
382
402
  )
383
403
 
384
- for _ in range(num_runs):
404
+ for i in range(num_runs):
385
405
  # Regenerate outputs each run by deep-copying the pristine dataset
386
406
  # so model responses are not reused across runs.
387
407
  run_id = generate_id()
388
- fresh_dataset = [copy.deepcopy(r) for r in data]
408
+ fresh_dataset = [r.model_copy(deep=True) for r in data]
389
409
 
390
410
  # apply new run_id to fresh_dataset
391
411
  for row in fresh_dataset:
392
- row.run_id = run_id
412
+ row.execution_metadata.run_id = run_id
393
413
 
394
414
  # generate new rollout_id for each row
395
415
  for row in fresh_dataset:
396
- row.rollout_id = generate_id()
416
+ row.execution_metadata.rollout_id = generate_id()
417
+
418
+ # log the fresh_dataset
419
+ for row in fresh_dataset:
420
+ active_logger.log(row)
397
421
 
398
422
  processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)
399
423
 
@@ -409,7 +433,7 @@ def evaluation_test( # noqa: C901
409
433
  raise ValueError(
410
434
  f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
411
435
  )
412
- all_results.append(result)
436
+ all_results[i].append(result)
413
437
  else:
414
438
  # Batch mode: call the test function with the full dataset
415
439
  results = execute_with_params(
@@ -433,17 +457,21 @@ def evaluation_test( # noqa: C901
433
457
  raise ValueError(
434
458
  f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
435
459
  )
436
- all_results.extend(results)
460
+ all_results[i] = results
437
461
 
438
- scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
462
+ scores = [
463
+ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
464
+ for result in all_results
465
+ ]
439
466
  agg_score = aggregate(scores, aggregation_method)
467
+ score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
440
468
 
441
469
  # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
442
470
  ci_low: float | None = None
443
471
  ci_high: float | None = None
444
472
  if aggregation_method == "mean":
445
473
  try:
446
- result_ci = compute_fixed_set_mu_ci(all_results)
474
+ result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
447
475
  mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
448
476
  if mu_ci_low is not None and mu_ci_high is not None:
449
477
  ci_low = float(mu_ci_low)
@@ -455,15 +483,24 @@ def evaluation_test( # noqa: C901
455
483
 
456
484
  # Determine if the evaluation passed based on threshold
457
485
  passed = None
458
- if threshold_of_success is not None:
459
- passed = agg_score >= threshold_of_success
486
+
487
+ if threshold is not None:
488
+ success_passed, std_passed = True, True
489
+
490
+ success_passed = agg_score >= threshold.success
491
+
492
+ if threshold.standard_deviation is not None:
493
+ std_passed = score_std <= threshold.standard_deviation
494
+
495
+ passed = success_passed and std_passed
460
496
 
461
497
  # Update eval metadata status and passed field for all results
462
- for r in all_results:
463
- if r.eval_metadata is not None:
464
- r.eval_metadata.status = "finished"
465
- r.eval_metadata.passed = passed
466
- default_logger.log(r)
498
+ for result in all_results:
499
+ for r in result:
500
+ if r.eval_metadata is not None:
501
+ r.eval_metadata.status = "finished"
502
+ r.eval_metadata.passed = passed
503
+ active_logger.log(r)
467
504
 
468
505
  # Optional: print and/or persist a summary artifact for CI
469
506
  try:
@@ -471,7 +508,7 @@ def evaluation_test( # noqa: C901
471
508
  summary_path = os.getenv("EP_SUMMARY_JSON")
472
509
  suite_name = test_func.__name__
473
510
  model_used = model_name
474
- total_rows = len(all_results)
511
+ total_rows = len([item for sublist in all_results for item in sublist])
475
512
  summary_obj = {
476
513
  "suite": suite_name,
477
514
  "model": model_used,
@@ -488,7 +525,7 @@ def evaluation_test( # noqa: C901
488
525
  from collections import defaultdict
489
526
 
490
527
  metric_scores: Dict[str, list] = defaultdict(list)
491
- for r in all_results:
528
+ for r in [item for sublist in all_results for item in sublist]:
492
529
  if r.evaluation_result and r.evaluation_result.metrics:
493
530
  for m_name, m_res in r.evaluation_result.metrics.items():
494
531
  if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -587,11 +624,32 @@ def evaluation_test( # noqa: C901
587
624
  # Do not fail evaluation if summary writing fails
588
625
  pass
589
626
 
627
+ # # Write all rows from active_logger.read() to a JSONL file in the same directory as the summary
628
+ # try:
629
+ # if active_logger is not None:
630
+ # rows = active_logger.read()
631
+ # # Write to a .jsonl file alongside the summary file
632
+ # jsonl_path = "logs.jsonl"
633
+ # import json
634
+
635
+ # with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
636
+ # for row in rows:
637
+ # json.dump(row.model_dump(exclude_none=True, mode="json"), f_jsonl)
638
+ # f_jsonl.write("\n")
639
+ # except Exception as e:
640
+ # # Do not fail evaluation if log writing fails
641
+ # print(e)
642
+ # pass
643
+
590
644
  # Check threshold after logging
591
- if threshold_of_success is not None and not passed:
645
+ if threshold is not None and not passed:
592
646
  assert (
593
- agg_score >= threshold_of_success
594
- ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
647
+ agg_score >= threshold.success
648
+ ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
649
+ if threshold.standard_deviation is not None:
650
+ assert (
651
+ score_std <= threshold.standard_deviation
652
+ ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
595
653
 
596
654
  except AssertionError:
597
655
  _log_eval_error("finished", data if "data" in locals() else None, passed=False)
@@ -5,6 +5,9 @@ Parameter types
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Callable, Dict, List, Literal, Optional
7
7
 
8
+ from eval_protocol.dataset_logger import default_logger
9
+ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
10
+
8
11
  from ..models import EvaluationRow, Message
9
12
 
10
13
  ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
@@ -39,10 +42,13 @@ Rollout processor types
39
42
  class RolloutProcessorConfig:
40
43
  model: ModelParam
41
44
  input_params: RolloutInputParam # optional input parameters for inference
42
- mcp_config_path: str
43
- server_script_path: Optional[str] = None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
45
+ mcp_config_path: str
46
+ server_script_path: Optional[str] = (
47
+ None # TODO: change from server_script_path to mcp_config_path for agent rollout processor
48
+ )
44
49
  max_concurrent_rollouts: int = 8 # maximum number of concurrent rollouts
45
50
  steps: int = 30 # max number of rollout steps
51
+ logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs
46
52
 
47
53
 
48
54
  RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]]