eval-protocol 0.2.7__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (334) hide show
  1. {eval_protocol-0.2.7/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
  2. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
  3. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
  4. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
  5. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +3 -3
  6. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
  7. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
  8. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +68 -36
  9. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
  10. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
  11. {eval_protocol-0.2.7 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
  12. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
  13. eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
  14. eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
  15. eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
  16. eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
  17. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
  18. eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
  19. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
  20. eval_protocol-0.2.7/vite-app/dist/assets/index-DWfIf2rx.css +0 -1
  21. eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js +0 -88
  22. eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js.map +0 -1
  23. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/LICENSE +0 -0
  24. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/README.md +0 -0
  25. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/__init__.py +0 -0
  26. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
  27. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
  28. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
  29. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
  30. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
  31. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
  32. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
  33. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
  34. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
  35. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
  36. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
  37. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
  38. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
  39. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
  40. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
  41. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
  42. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
  43. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
  44. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
  45. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
  46. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
  47. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
  48. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
  49. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
  50. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
  51. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
  52. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
  53. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
  54. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
  55. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
  56. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
  57. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
  58. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
  59. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
  60. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
  61. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
  62. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
  63. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
  64. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
  65. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
  66. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
  67. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
  68. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
  69. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
  70. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
  71. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
  72. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
  73. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
  74. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
  75. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
  76. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
  77. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
  78. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
  79. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
  80. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
  81. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
  82. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
  83. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
  84. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
  85. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
  86. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
  87. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
  88. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
  89. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
  90. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
  91. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
  92. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
  93. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
  94. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
  95. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
  96. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
  97. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
  98. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
  99. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
  100. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
  101. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
  102. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
  103. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
  104. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
  105. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
  106. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
  107. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
  108. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
  109. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
  110. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
  111. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
  112. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
  113. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
  114. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
  115. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
  116. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
  117. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
  118. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
  119. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
  120. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
  121. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
  122. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
  123. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
  124. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
  125. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
  126. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
  127. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
  128. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
  129. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
  130. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
  131. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
  132. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +0 -0
  133. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
  134. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
  135. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
  136. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
  137. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
  138. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
  139. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
  140. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
  141. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
  142. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
  143. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
  144. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
  145. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
  146. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
  147. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
  148. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
  149. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
  150. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
  151. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
  152. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
  153. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
  154. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
  155. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
  156. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
  157. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
  158. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
  159. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
  160. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
  161. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
  162. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
  163. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
  164. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
  165. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
  166. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
  167. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
  168. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
  169. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
  170. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
  171. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
  172. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
  173. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
  174. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
  175. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
  176. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
  177. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/pyproject.toml +0 -0
  178. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.cfg +0 -0
  179. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.py +0 -0
  180. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
  181. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
  182. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
  183. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
  184. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
  185. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
  186. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
  187. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
  188. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
  189. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
  190. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
  191. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
  192. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
  193. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
  194. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
  195. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
  196. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
  197. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
  198. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
  199. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
  200. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
  201. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
  202. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
  203. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
  204. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
  205. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
  206. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
  207. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
  208. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
  209. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
  210. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
  211. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
  212. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
  213. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
  214. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
  215. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
  216. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
  217. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
  218. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
  219. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
  220. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
  221. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
  222. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
  223. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
  224. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
  225. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
  226. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
  227. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
  228. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
  229. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
  230. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
  231. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
  232. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
  233. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
  234. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
  235. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
  236. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
  237. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
  238. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
  239. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
  240. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
  241. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
  242. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
  243. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
  244. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
  245. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
  246. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
  247. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
  248. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
  249. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
  250. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
  251. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
  252. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
  253. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
  254. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
  255. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
  256. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
  257. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
  258. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
  259. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
  260. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
  261. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
  262. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
  263. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
  264. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
  265. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
  266. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
  267. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
  268. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
  269. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
  270. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
  271. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
  272. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
  273. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
  274. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
  275. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
  276. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
  277. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
  278. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
  279. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
  280. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
  281. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
  282. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
  283. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
  284. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
  285. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
  286. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
  287. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
  288. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
  289. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
  290. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
  291. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
  292. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
  293. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
  294. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
  295. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
  296. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
  297. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
  298. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
  299. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
  300. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
  301. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
  302. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
  303. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
  304. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
  305. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
  306. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
  307. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
  308. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
  309. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
  310. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
  311. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
  312. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
  313. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
  314. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
  315. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
  316. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
  317. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
  318. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
  319. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
  320. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
  321. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
  322. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
  323. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
  324. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
  325. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
  326. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
  327. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
  328. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
  329. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
  330. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
  331. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
  332. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/versioneer.py +0 -0
  333. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
  334. {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.7
+Version: 0.2.8
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2025-08-11T00:47:52-0700",
+ "date": "2025-08-11T22:02:14-0700",
 "dirty": false,
 "error": null,
- "full-revisionid": "38a44449f6d48a8a79eb11a0aaf873129df3e994",
- "version": "0.2.7"
+ "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
+ "version": "0.2.8"
 }
 '''  # END VERSION_JSON

@@ -289,6 +289,7 @@ def parse_args(args=None):
289
289
 
290
290
  # Logs command
291
291
  logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
292
+ logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
292
293
 
293
294
  # Run command (for Hydra-based evaluations)
294
295
  # This subparser intentionally defines no arguments itself.
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
11
11
  def logs_command(args):
12
12
  """Serve logs with file watching and real-time updates"""
13
13
 
14
+ port = args.port
14
15
  print(f"🚀 Starting Eval Protocol Logs Server")
15
- print(f"🌐 URL: http://localhost:8000")
16
- print(f"🔌 WebSocket: ws://localhost:8000/ws")
16
+ print(f"🌐 URL: http://localhost:{port}")
17
+ print(f"🔌 WebSocket: ws://localhost:{port}/ws")
17
18
  print(f"👀 Watching paths: {['current directory']}")
18
19
  print("Press Ctrl+C to stop the server")
19
20
  print("-" * 50)
20
21
 
21
22
  try:
22
- serve_logs()
23
+ serve_logs(port=args.port)
23
24
  return 0
24
25
  except KeyboardInterrupt:
25
26
  print("\n🛑 Server stopped by user")
@@ -37,9 +37,9 @@ class SqliteEvaluationRowStore:
37
37
  return self._db_path
38
38
 
39
39
  def upsert_row(self, data: dict) -> None:
40
- rollout_id = data["rollout_id"]
41
- if "rollout_id" not in data:
42
- raise ValueError("rollout_id is required to upsert a row")
40
+ rollout_id = data["execution_metadata"]["rollout_id"]
41
+ if rollout_id is None:
42
+ raise ValueError("execution_metadata.rollout_id is required to upsert a row")
43
43
  if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
44
44
  self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
45
45
  else:
@@ -158,8 +158,8 @@ class ExecutionManager:
158
158
  messages.append(Message.model_validate(msg_dict))
159
159
 
160
160
  evaluation_rows[idx].messages = messages
161
- evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
- evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
161
+ # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
162
+ # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
163
163
  evaluation_rows[idx].tools = shared_tool_schema
164
164
  evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
165
165
  evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ class ExecutionManager:
482
482
  trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
483
483
  try:
484
484
  await envs.connection_manager.reset_session(session)
485
- except:
485
+ except: # noqa: E722
486
486
  logger.error(f"Error resetting session {session.session_id}")
487
487
  try:
488
488
  await envs.connection_manager.close_session(session)
489
- except:
489
+ except: # noqa: E722
490
490
  logger.error(f"Error closing session {session.session_id}")
491
491
  return trajectory
492
492
 
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
202
202
  )
203
203
 
204
204
 
205
+ class EvaluationThreshold(BaseModel):
206
+ """Threshold configuration for evaluation tests.
207
+
208
+ The success field is required - tests must specify a minimum success rate.
209
+ The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
210
+ """
211
+
212
+ success: float = Field(
213
+ ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
214
+ )
215
+ standard_deviation: Optional[float] = Field(
216
+ None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
217
+ )
218
+
219
+
205
220
  class EvalMetadata(BaseModel):
206
221
  """Metadata about the evaluation that was run."""
207
222
 
@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
216
231
  )
217
232
  num_runs: int = Field(..., description="Number of times the evaluation was repeated")
218
233
  aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
219
- threshold_of_success: Optional[float] = Field(None, description="Threshold score for test success")
234
+ passed_threshold: Optional[EvaluationThreshold] = Field(
235
+ None, description="Threshold configuration for test success"
236
+ )
220
237
  passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
221
238
 
222
239
 
240
+ class ExecutionMetadata(BaseModel):
241
+ """Metadata about the execution of the evaluation."""
242
+
243
+ invocation_id: Optional[str] = Field(
244
+ default_factory=generate_id,
245
+ description="The ID of the invocation that this row belongs to.",
246
+ )
247
+
248
+ experiment_id: Optional[str] = Field(
249
+ default_factory=generate_id,
250
+ description="The ID of the experiment that this row belongs to.",
251
+ )
252
+
253
+ rollout_id: Optional[str] = Field(
254
+ default_factory=generate_id,
255
+ description="The ID of the rollout that this row belongs to.",
256
+ )
257
+
258
+ run_id: Optional[str] = Field(
259
+ None,
260
+ description=("The ID of the run that this row belongs to."),
261
+ )
262
+
263
+
223
264
  class RolloutStatus(BaseModel):
224
265
  """Status of the rollout."""
225
266
 
@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
264
305
  description="The status of the rollout.",
265
306
  )
266
307
 
267
- invocation_id: Optional[str] = Field(
268
- default_factory=generate_id,
269
- description="The ID of the invocation that this row belongs to.",
270
- )
271
-
272
- cohort_id: Optional[str] = Field(
273
- default_factory=generate_id,
274
- description="The ID of the cohort that this row belongs to.",
275
- )
276
-
277
- rollout_id: Optional[str] = Field(
278
- default_factory=generate_id,
279
- description="The ID of the rollout that this row belongs to.",
280
- )
281
-
282
- run_id: Optional[str] = Field(
283
- None,
284
- description=("The ID of the run that this row belongs to."),
285
- )
286
-
287
308
  # Ground truth reference (moved from EvaluateResult to top level)
288
309
  ground_truth: Optional[str] = Field(
289
310
  default=None, description="Optional ground truth reference for this evaluation."
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
294
315
  default=None, description="The evaluation result for this row/trajectory."
295
316
  )
296
317
 
318
+ execution_metadata: ExecutionMetadata = Field(
319
+ default_factory=ExecutionMetadata,
320
+ description="Metadata about the execution of the evaluation.",
321
+ )
322
+
297
323
  # LLM usage statistics
298
324
  usage: Optional[CompletionUsage] = Field(
299
325
  default=None, description="Token usage statistics from LLM calls during execution."
@@ -3,14 +3,21 @@ import inspect
3
3
  import math
4
4
  import os
5
5
  import statistics
6
- from typing import Any, Callable, Dict, List, Literal, Optional
6
+ from typing import Any, Callable, Dict, List, Literal, Optional, Union
7
7
 
8
8
  import pytest
9
9
 
10
10
  from eval_protocol.dataset_logger import default_logger
11
11
  from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
12
12
  from eval_protocol.human_id import generate_id
13
- from eval_protocol.models import CompletionParams, EvalMetadata, EvaluationRow, InputMetadata, Message
13
+ from eval_protocol.models import (
14
+ CompletionParams,
15
+ EvalMetadata,
16
+ EvaluationRow,
17
+ EvaluationThreshold,
18
+ InputMetadata,
19
+ Message,
20
+ )
14
21
  from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
15
22
  from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
16
23
  from eval_protocol.pytest.types import (
@@ -47,7 +54,7 @@ def evaluation_test( # noqa: C901
47
54
  rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
48
55
  evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
49
56
  aggregation_method: AggregationMethod = "mean",
50
- threshold_of_success: Optional[float] = None,
57
+ passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
51
58
  num_runs: int = 1,
52
59
  max_dataset_rows: Optional[int] = None,
53
60
  mcp_config_path: Optional[str] = None,
@@ -66,14 +73,14 @@ def evaluation_test( # noqa: C901
66
73
  Here are some key concepts to understand the terminology in EP:
67
74
 
68
75
  - "invocation" is a single execution of a test function. An invocation can
69
- generate 1 or more cohorts. Grouping by invocation might be useful to
76
+ generate 1 or more experiments. Grouping by invocation might be useful to
70
77
  aggregate eval scores across multiple invocations when you want to aggregate
71
78
  scores across multiple datasets.
72
- - "cohort" is a group of runs with for a combination of parameters. A single
73
- cohort will have multiple runs if num_runs > 1.
79
+ - "experiment" is a group of runs with for a combination of parameters. A single
80
+ experiment will have multiple runs if num_runs > 1.
74
81
  1. If your evaluation_test has combinations of parameters, it will generate
75
- multiple cohorts per combination of parameters.
76
- 2. A new execution of a test function will generate a new cohort.
82
+ multiple experiments per combination of parameters.
83
+ 2. A new execution of a test function will generate a new experiment.
77
84
  - "run" is a group of rollouts. For multiple num_runs > 1, there will be
78
85
  multiple "run_id"s.
79
86
  - "rollout" is the execution/process that produces a "trajectory". You
@@ -91,7 +98,7 @@ def evaluation_test( # noqa: C901
91
98
  decorated test. It simply produces a score from 0 to 1 and attached it
92
99
  to the row as the "evaluation_result" field.
93
100
 
94
- "invocation", "cohort", "run", "rollout", and "row" each have a unique ID
101
+ "invocation", "experiment", "run", "rollout", and "row" each have a unique ID
95
102
  which can be used to easily group and identify your dataset by.
96
103
 
97
104
  Args:
@@ -108,8 +115,8 @@ def evaluation_test( # noqa: C901
108
115
  rollout_processor: Function used to perform the rollout.
109
116
  evaluation_test_kwargs: Kwargs for the evaluation function.
110
117
  aggregation_method: How to aggregate scores across rows.
111
- threshold_of_success: If set, fail the test if the aggregated score is
112
- below this threshold.
118
+ passed_threshold: Threshold configuration for test success.
119
+ Success rate must be above success, and if set, standard deviation must be below standard_deviation.
113
120
  num_runs: Number of times to repeat the rollout and evaluations.
114
121
  max_dataset_rows: Limit dataset to the first N rows.
115
122
  mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -127,6 +134,14 @@ def evaluation_test( # noqa: C901
127
134
  def decorator(
128
135
  test_func: TestFunction,
129
136
  ):
137
+ if passed_threshold is not None:
138
+ if isinstance(passed_threshold, float):
139
+ threshold = EvaluationThreshold(success=passed_threshold)
140
+ else:
141
+ threshold = EvaluationThreshold(**passed_threshold)
142
+ else:
143
+ threshold = None
144
+
130
145
  sig = inspect.signature(test_func)
131
146
 
132
147
  # For pointwise/rowwise mode, we expect a different signature
@@ -285,9 +300,9 @@ def evaluation_test( # noqa: C901
285
300
  def wrapper_body(**kwargs):
286
301
  model_name = kwargs["model"]
287
302
  eval_metadata = None
288
- all_results: List[EvaluationRow] = []
303
+ all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
289
304
 
290
- cohort_id = generate_id()
305
+ experiment_id = generate_id()
291
306
 
292
307
  def _log_eval_error(
293
308
  status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
@@ -346,7 +361,7 @@ def evaluation_test( # noqa: C901
346
361
  status="running",
347
362
  num_runs=num_runs,
348
363
  aggregation_method=aggregation_method,
349
- threshold_of_success=threshold_of_success,
364
+ passed_threshold=threshold,
350
365
  passed=None,
351
366
  )
352
367
 
@@ -368,8 +383,8 @@ def evaluation_test( # noqa: C901
368
383
  row.input_metadata.session_data["mode"] = mode
369
384
  # Initialize eval_metadata for each row
370
385
  row.eval_metadata = eval_metadata
371
- row.cohort_id = cohort_id
372
- row.invocation_id = invocation_id
386
+ row.execution_metadata.experiment_id = experiment_id
387
+ row.execution_metadata.invocation_id = invocation_id
373
388
 
374
389
  # has to be done in the pytest main process since it's
375
390
  # used to determine whether this eval has stopped
@@ -386,19 +401,19 @@ def evaluation_test( # noqa: C901
386
401
  logger=active_logger,
387
402
  )
388
403
 
389
- for _ in range(num_runs):
404
+ for i in range(num_runs):
390
405
  # Regenerate outputs each run by deep-copying the pristine dataset
391
406
  # so model responses are not reused across runs.
392
407
  run_id = generate_id()
393
- fresh_dataset = [copy.deepcopy(r) for r in data]
408
+ fresh_dataset = [r.model_copy(deep=True) for r in data]
394
409
 
395
410
  # apply new run_id to fresh_dataset
396
411
  for row in fresh_dataset:
397
- row.run_id = run_id
412
+ row.execution_metadata.run_id = run_id
398
413
 
399
414
  # generate new rollout_id for each row
400
415
  for row in fresh_dataset:
401
- row.rollout_id = generate_id()
416
+ row.execution_metadata.rollout_id = generate_id()
402
417
 
403
418
  # log the fresh_dataset
404
419
  for row in fresh_dataset:
@@ -418,7 +433,7 @@ def evaluation_test( # noqa: C901
418
433
  raise ValueError(
419
434
  f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
420
435
  )
421
- all_results.append(result)
436
+ all_results[i].append(result)
422
437
  else:
423
438
  # Batch mode: call the test function with the full dataset
424
439
  results = execute_with_params(
@@ -442,17 +457,21 @@ def evaluation_test( # noqa: C901
442
457
  raise ValueError(
443
458
  f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
444
459
  )
445
- all_results.extend(results)
460
+ all_results[i] = results
446
461
 
447
- scores = [r.evaluation_result.score for r in all_results if r.evaluation_result]
462
+ scores = [
463
+ sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
464
+ for result in all_results
465
+ ]
448
466
  agg_score = aggregate(scores, aggregation_method)
467
+ score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
449
468
 
450
469
  # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
451
470
  ci_low: float | None = None
452
471
  ci_high: float | None = None
453
472
  if aggregation_method == "mean":
454
473
  try:
455
- result_ci = compute_fixed_set_mu_ci(all_results)
474
+ result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
456
475
  mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
457
476
  if mu_ci_low is not None and mu_ci_high is not None:
458
477
  ci_low = float(mu_ci_low)
@@ -464,15 +483,24 @@ def evaluation_test( # noqa: C901
464
483
 
465
484
  # Determine if the evaluation passed based on threshold
466
485
  passed = None
467
- if threshold_of_success is not None:
468
- passed = agg_score >= threshold_of_success
486
+
487
+ if threshold is not None:
488
+ success_passed, std_passed = True, True
489
+
490
+ success_passed = agg_score >= threshold.success
491
+
492
+ if threshold.standard_deviation is not None:
493
+ std_passed = score_std <= threshold.standard_deviation
494
+
495
+ passed = success_passed and std_passed
469
496
 
470
497
  # Update eval metadata status and passed field for all results
471
- for r in all_results:
472
- if r.eval_metadata is not None:
473
- r.eval_metadata.status = "finished"
474
- r.eval_metadata.passed = passed
475
- active_logger.log(r)
498
+ for result in all_results:
499
+ for r in result:
500
+ if r.eval_metadata is not None:
501
+ r.eval_metadata.status = "finished"
502
+ r.eval_metadata.passed = passed
503
+ active_logger.log(r)
476
504
 
477
505
  # Optional: print and/or persist a summary artifact for CI
478
506
  try:
@@ -480,7 +508,7 @@ def evaluation_test( # noqa: C901
480
508
  summary_path = os.getenv("EP_SUMMARY_JSON")
481
509
  suite_name = test_func.__name__
482
510
  model_used = model_name
483
- total_rows = len(all_results)
511
+ total_rows = len([item for sublist in all_results for item in sublist])
484
512
  summary_obj = {
485
513
  "suite": suite_name,
486
514
  "model": model_used,
@@ -497,7 +525,7 @@ def evaluation_test( # noqa: C901
497
525
  from collections import defaultdict
498
526
 
499
527
  metric_scores: Dict[str, list] = defaultdict(list)
500
- for r in all_results:
528
+ for r in [item for sublist in all_results for item in sublist]:
501
529
  if r.evaluation_result and r.evaluation_result.metrics:
502
530
  for m_name, m_res in r.evaluation_result.metrics.items():
503
531
  if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -614,10 +642,14 @@ def evaluation_test( # noqa: C901
614
642
  # pass
615
643
 
616
644
  # Check threshold after logging
617
- if threshold_of_success is not None and not passed:
645
+ if threshold is not None and not passed:
618
646
  assert (
619
- agg_score >= threshold_of_success
620
- ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}"
647
+ agg_score >= threshold.success
648
+ ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
649
+ if threshold.standard_deviation is not None:
650
+ assert (
651
+ score_std <= threshold.standard_deviation
652
+ ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
621
653
 
622
654
  except AssertionError:
623
655
  _log_eval_error("finished", data if "data" in locals() else None, passed=False)
@@ -87,18 +87,32 @@ class WebSocketManager:
87
87
  return
88
88
 
89
89
  tasks = []
90
+ failed_connections = []
91
+
90
92
  for connection in connections:
91
93
  try:
92
94
  tasks.append(connection.send_text(text))
93
95
  except Exception as e:
94
96
  logger.error(f"Failed to send text to WebSocket: {e}")
95
- with self._lock:
96
- try:
97
- self.active_connections.remove(connection)
98
- except ValueError:
99
- pass
97
+ failed_connections.append(connection)
98
+
99
+ # Execute all sends in parallel
100
100
  if tasks:
101
- await asyncio.gather(*tasks, return_exceptions=True)
101
+ results = await asyncio.gather(*tasks, return_exceptions=True)
102
+
103
+ # Check for any exceptions that occurred during execution
104
+ for i, result in enumerate(results):
105
+ if isinstance(result, Exception):
106
+ logger.error(f"Failed to send text to WebSocket: {result}")
107
+ failed_connections.append(connections[i])
108
+
109
+ # Remove all failed connections
110
+ with self._lock:
111
+ for connection in failed_connections:
112
+ try:
113
+ self.active_connections.remove(connection)
114
+ except ValueError:
115
+ pass
102
116
 
103
117
  def start_broadcast_loop(self):
104
118
  """Start the broadcast loop in the current event loop."""
@@ -109,6 +123,7 @@ class WebSocketManager:
109
123
  """Stop the broadcast loop."""
110
124
  if self._broadcast_task and not self._broadcast_task.done():
111
125
  self._broadcast_task.cancel()
126
+ self._broadcast_task = None
112
127
 
113
128
 
114
129
  class EvaluationWatcher:
@@ -233,7 +248,6 @@ class LogsServer(ViteServer):
233
248
 
234
249
  # Subscribe to events and start listening for cross-process events
235
250
  event_bus.subscribe(self._handle_event)
236
- event_bus.start_listening()
237
251
 
238
252
  logger.info(f"LogsServer initialized on {host}:{port}")
239
253
 
@@ -273,6 +287,12 @@ class LogsServer(ViteServer):
273
287
  data = EvaluationRow(**data)
274
288
  self.websocket_manager.broadcast_row_upserted(data)
275
289
 
290
+ def start_loops(self):
291
+ """Start the broadcast loop and evaluation watcher."""
292
+ self.websocket_manager.start_broadcast_loop()
293
+ self.evaluation_watcher.start()
294
+ event_bus.start_listening()
295
+
276
296
  async def run_async(self):
277
297
  """
278
298
  Run the logs server asynchronously with file watching.
@@ -285,11 +305,7 @@ class LogsServer(ViteServer):
285
305
  logger.info(f"Serving files from: {self.build_dir}")
286
306
  logger.info("WebSocket endpoint available at /ws")
287
307
 
288
- # Start the broadcast loop
289
- self.websocket_manager.start_broadcast_loop()
290
-
291
- # Start the evaluation watcher
292
- self.evaluation_watcher.start()
308
+ self.start_loops()
293
309
 
294
310
  config = uvicorn.Config(
295
311
  self.app,
@@ -319,20 +335,54 @@ class LogsServer(ViteServer):
319
335
  asyncio.run(self.run_async())
320
336
 
321
337
 
322
- server = LogsServer()
323
- app = server.app
338
+ def create_app(host: str = "localhost", port: int = 8000, build_dir: Optional[str] = None) -> FastAPI:
339
+ """
340
+ Factory function to create a FastAPI app instance and start the server with async loops.
341
+
342
+ This creates a LogsServer instance and starts it in a background thread to ensure
343
+ all async loops (WebSocket broadcast, evaluation watching) are running.
324
344
 
345
+ Args:
346
+ host: Host to bind to
347
+ port: Port to bind to
348
+ build_dir: Optional custom build directory path
325
349
 
326
- def serve_logs():
350
+ Returns:
351
+ FastAPI app instance with server running in background
352
+ """
353
+ if build_dir is None:
354
+ build_dir = os.path.abspath(
355
+ os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
356
+ )
357
+
358
+ server = LogsServer(host=host, port=port, build_dir=build_dir)
359
+ server.start_loops()
360
+ return server.app
361
+
362
+
363
+ # For backward compatibility and direct usage
364
+ def serve_logs(port: Optional[int] = None):
327
365
  """
328
366
  Convenience function to create and run a LogsServer.
329
367
  """
330
- global server, app
331
- if server is None:
332
- server = LogsServer()
333
- app = server.app
368
+ server = LogsServer(port=port)
334
369
  server.run()
335
370
 
336
371
 
337
372
  if __name__ == "__main__":
338
- serve_logs()
373
+ import argparse
374
+
375
+ parser = argparse.ArgumentParser(description="Start the evaluation logs server")
376
+ parser.add_argument("--host", default="localhost", help="Host to bind to (default: localhost)")
377
+ parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
378
+ parser.add_argument("--build-dir", help="Path to Vite build directory")
379
+
380
+ args = parser.parse_args()
381
+
382
+ # Create server with command line arguments
383
+ if args.build_dir:
384
+ server = LogsServer(host=args.host, port=args.port, build_dir=args.build_dir)
385
+ else:
386
+ server = LogsServer(host=args.host, port=args.port)
387
+
388
+ server.run()