PyPI - eval-protocol - Versions diffs - 0.2.99__tar.gz → 0.2.99.dev2__tar.gz - Mend

eval-protocol 0.2.99__tar.gz → 0.2.99.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (472) hide show

{eval_protocol-0.2.99/eval_protocol.egg-info → eval_protocol-0.2.99.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.99
+Version: 0.2.99.dev2
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2025-12-16T16:20:44-0800",
+ "date": "2025-12-17T19:22:32-0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "2b765e03b649eee53bc18f024d5e7f7dbeb2891a",
- "version": "0.2.99"
+ "full-revisionid": "686ed67e7b83d4451d8fbd613f7d261a41fff9cb",
+ "version": "0.2.99.dev.2"
 }
 '''  # END VERSION_JSON

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/create_rft.py RENAMED Viewed

@@ -660,8 +660,8 @@ def _create_rft_job(
         ("temperature", "temperature"),
         ("topP", "top_p"),
         ("topK", "top_k"),
-        ("maxTokens", "max_output_tokens"),
-        ("n", "response_candidates_count"),
+        ("maxOutputTokens", "max_output_tokens"),
+        ("responseCandidatesCount", "response_candidates_count"),
     ]:
         val = getattr(args, arg_name, None)
         if val is not None:

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/tinker_rollout_processor.py RENAMED Viewed

@@ -152,7 +152,7 @@ class TinkerRolloutProcessor(RolloutProcessor):
             # Update row
             new_messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
             row.messages = new_messages
-            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
             # Log usage (approximate since Tinker might not return usage stats in same format)
             # We can count tokens ourselves

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/manager.py RENAMED Viewed

@@ -150,7 +150,7 @@ class ExecutionManager:
                 else:
                     evaluation_row.rollout_status = Status.rollout_running()
-                evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - row_start_time
+                evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - row_start_time
                 return evaluation_row

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/models.py RENAMED Viewed

@@ -809,9 +809,21 @@ class ExecutionMetadata(BaseModel):
     cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")
+    # deprecated: use rollout_duration_seconds and eval_duration_seconds instead
     duration_seconds: Optional[float] = Field(
         default=None,
-        description="Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+        deprecated=True,
+        description="[Deprecated] Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+    )
+    rollout_duration_seconds: Optional[float] = Field(
+        default=None,
+        description="Processing duration in seconds for the rollout of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
+    )
+    eval_duration_seconds: Optional[float] = Field(
+        default=None,
+        description="Processing duration in seconds for the evaluation of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
     )
     experiment_duration_seconds: Optional[float] = Field(

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py RENAMED Viewed

@@ -267,7 +267,7 @@ class AgentRolloutProcessor(RolloutProcessor):
                     total_tokens=agent.usage["total_tokens"],
                 )
-                agent.evaluation_row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+                agent.evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
                 return agent.evaluation_row
             finally:

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py RENAMED Viewed

@@ -83,7 +83,7 @@ class PydanticAgentRolloutProcessor(RolloutProcessor):
             #     total_tokens=usage_info.total_tokens or 0,
             # )
-            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
             return row

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py RENAMED Viewed

@@ -180,7 +180,7 @@ class SingleTurnRolloutProcessor(RolloutProcessor):
             row.messages = messages
-            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
             default_logger.log(row)
             return row

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_utils.py RENAMED Viewed

@@ -42,7 +42,7 @@ AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
 async def run_tasks_with_eval_progress(
-    pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int
+    pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int, disable_tqdm: bool = False
 ) -> list[EvaluationRow]:
     """
     Run evaluation tasks with a progress bar and proper cancellation handling.
@@ -66,6 +66,7 @@ async def run_tasks_with_eval_progress(
         miniters=1,
         mininterval=0.1,
         bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+        disable=disable_tqdm,
     ) as eval_pbar:
         async def task_with_progress(task: asyncio.Task[EvaluationRow]) -> EvaluationRow:
@@ -88,7 +89,10 @@ async def run_tasks_with_eval_progress(
 async def run_tasks_with_run_progress(
-    execute_run_func: Callable[[int, RolloutProcessorConfig], Any], num_runs: int, config: RolloutProcessorConfig
+    execute_run_func: Callable[[int, RolloutProcessorConfig], Any],
+    num_runs: int,
+    config: RolloutProcessorConfig,
+    disable_tqdm: bool = False,
 ) -> None:
     """
     Run tasks with a parallel runs progress bar, preserving original logic.
@@ -108,6 +112,7 @@ async def run_tasks_with_run_progress(
         dynamic_ncols=True,
         miniters=1,
         bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+        disable=disable_tqdm,
     ) as run_pbar:
         async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig) -> Any:
@@ -330,6 +335,7 @@ async def rollout_processor_with_retry(
     fresh_dataset: list[EvaluationRow],
     config: RolloutProcessorConfig,
     run_idx: int = 0,
+    disable_tqdm: bool = False,
 ) -> AsyncGenerator[EvaluationRow, None]:
     """
     Wrapper around rollout_processor that handles retry logic using the Python backoff library.
@@ -449,6 +455,7 @@ async def rollout_processor_with_retry(
             miniters=1,
             mininterval=0.1,
             bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+            disable=disable_tqdm,
         ) as rollout_pbar:
             # Yield results as they complete
             for task in asyncio.as_completed(retry_tasks):

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/github_action_rollout_processor.py RENAMED Viewed

@@ -162,7 +162,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
                 row.rollout_status = Status.rollout_error(
                     f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
                 )
-                row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
                 return row
             run_id = run.get("id")
@@ -170,7 +170,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
                 row.rollout_status = Status.rollout_error(
                     f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
                 )
-                row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
                 return row
             # Poll the specific run until completion
@@ -194,10 +194,10 @@ class GithubActionRolloutProcessor(RolloutProcessor):
                 row.rollout_status = Status.rollout_error(
                     f"GitHub Actions run timed out after {self.timeout_seconds} seconds"
                 )
-                row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
                 return row
-            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
             def _update_with_trace() -> None:
                 return update_row_with_remote_trace(row, self._output_data_loader, self.model_base_url)

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/openenv_rollout_processor.py RENAMED Viewed

@@ -411,7 +411,7 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
                     completion_tokens=usage["completion_tokens"],
                     total_tokens=usage["total_tokens"],
                 )
-                row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+                row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
                 # Attach per-step rewards and accumulated token IDs to
                 # execution_metadata.extra for downstream integrations
@@ -436,14 +436,14 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
                 logger.info("[OpenEnvRolloutProcessor] Total reward: %.3f", total_reward)
                 logger.info(
                     "[OpenEnvRolloutProcessor] Duration: %.2fs",
-                    row.execution_metadata.duration_seconds,
+                    row.execution_metadata.rollout_duration_seconds,
                 )
                 logger.debug("[OpenEnvRolloutProcessor] Messages collected: %d", len(messages))
                 logger.info(
                     f"Rollout complete: {len(step_rewards)} steps, "
                     f"total_reward={total_reward:.2f}, "
-                    f"duration={row.execution_metadata.duration_seconds:.2f}s"
+                    f"duration={row.execution_metadata.rollout_duration_seconds:.2f}s"
                 )
                 # Final log with complete message history
                 if getattr(config, "logger", None):

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/priority_scheduler.py RENAMED Viewed

@@ -1,9 +1,12 @@
 import asyncio
 import logging
 import os
+import time
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Any, Callable, List, Dict, Optional, Union, Awaitable
+from typing import Any, List, Dict, Optional, Union
+from tqdm.asyncio import tqdm as async_tqdm
 from eval_protocol.models import EvaluationRow, Status
 from eval_protocol.pytest.types import RolloutProcessorConfig, TestFunction
@@ -79,6 +82,18 @@ class PriorityRolloutScheduler:
         self.rollout_n = rollout_n
         self.in_group_minibatch_size = in_group_minibatch_size if in_group_minibatch_size > 0 else rollout_n
         self.evaluation_test_kwargs = evaluation_test_kwargs
+        # Progress bars (initialized in run())
+        self.rollout_pbar: Optional[async_tqdm] = None
+        self.eval_pbar: Optional[async_tqdm] = None
+        # Track active rollouts: {row_index: set of run_indices currently in progress}
+        self.active_rollouts: Dict[int, set] = defaultdict(set)
+        self.active_rollouts_lock = asyncio.Lock()
+        # Track active evaluations
+        self.active_evals: int = 0
+        self.active_evals_lock = asyncio.Lock()
     async def schedule_dataset(
         self,
@@ -132,41 +147,68 @@ class PriorityRolloutScheduler:
             experiment_id = rows_to_eval[0].execution_metadata.experiment_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.experiment_id
             run_id = rows_to_eval[0].execution_metadata.run_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.run_id
             eval_res = None
+            # Track active eval
+            async with self.active_evals_lock:
+                self.active_evals += 1
+                if self.eval_pbar:
+                    self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
+            start_time = time.perf_counter()
-            async with self.eval_sem:
-                async with rollout_logging_context(
-                    rollout_id or "",
-                    experiment_id=experiment_id,
-                    run_id=run_id,
-                ):
-                    if isinstance(rows_to_eval, list):
-                        eval_res = await execute_pytest_with_exception_handling(
-                            test_func=self.eval_executor,
-                            evaluation_test_kwargs=self.evaluation_test_kwargs,
-                            processed_dataset=rows_to_eval,
-                        )
-                    else:
-                        eval_res = await execute_pytest_with_exception_handling(
-                            test_func=self.eval_executor,
-                            evaluation_test_kwargs=self.evaluation_test_kwargs,
-                            processed_row=rows_to_eval,
-                        )
-            # push result to the output buffer
-            if self.output_buffer:
+            try:
+                async with self.eval_sem:
+                    async with rollout_logging_context(
+                        rollout_id or "",
+                        experiment_id=experiment_id,
+                        run_id=run_id,
+                    ):
+                        if isinstance(rows_to_eval, list):
+                            eval_res = await execute_pytest_with_exception_handling(
+                                test_func=self.eval_executor,
+                                evaluation_test_kwargs=self.evaluation_test_kwargs,
+                                processed_dataset=rows_to_eval,
+                            )
+                        else:
+                            eval_res = await execute_pytest_with_exception_handling(
+                                test_func=self.eval_executor,
+                                evaluation_test_kwargs=self.evaluation_test_kwargs,
+                                processed_row=rows_to_eval,
+                            )
+                eval_duration = time.perf_counter() - start_time
+                # Set eval_duration_seconds BEFORE buffer writes to ensure it's included in serialization
                 if isinstance(eval_res, list):
                     for row in eval_res:
-                        self._post_process_result(row)
-                        await self.output_buffer.add_result(row)
+                        row.execution_metadata.eval_duration_seconds = eval_duration
                 else:
-                    self._post_process_result(eval_res)
-                    await self.output_buffer.add_result(eval_res)
+                    eval_res.execution_metadata.eval_duration_seconds = eval_duration
-            if isinstance(eval_res, list):
-                self.results.extend(eval_res)
-            else:
-                self.results.append(eval_res)
-            return eval_res
+                # push result to the output buffer
+                if self.output_buffer:
+                    if isinstance(eval_res, list):
+                        for row in eval_res:
+                            self._post_process_result(row)
+                            await self.output_buffer.add_result(row)
+                    else:
+                        self._post_process_result(eval_res)
+                        await self.output_buffer.add_result(eval_res)
+                if isinstance(eval_res, list):
+                    for row in eval_res:
+                        self.results.append(row)
+                else:
+                    self.results.append(eval_res)
+                return eval_res
+            finally:
+                # Always update progress bar (handles both success and failure cases)
+                if self.eval_pbar:
+                    self.eval_pbar.update(1)
+                # Decrement active eval counter
+                async with self.active_evals_lock:
+                    self.active_evals -= 1
+                    if self.eval_pbar:
+                        self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
         # 1. Prepare Config & Row for this micro-batch
         current_batch_rows = []
@@ -205,15 +247,33 @@ class PriorityRolloutScheduler:
         batch_results: List[EvaluationRow] = []
         if current_batch_rows:
             for idx, row in current_batch_rows:
-                async for result_row in rollout_processor_with_retry(
-                    self.rollout_processor, [row], task.config, idx
-                ):
-                    batch_results.append(result_row)
-                    # in pointwise, we start evaluation immediately
-                    if self.mode == "pointwise":
-                        t = asyncio.create_task(_run_eval(result_row))
-                        self.background_tasks.add(t)
-                        t.add_done_callback(self.background_tasks.discard)
+                # Track this rollout as active
+                async with self.active_rollouts_lock:
+                    self.active_rollouts[task.row_index].add(idx)
+                    await self._update_rollout_pbar_postfix()
+                try:
+                    async for result_row in rollout_processor_with_retry(
+                        self.rollout_processor, [row], task.config, idx, disable_tqdm=True
+                    ):
+                        batch_results.append(result_row)
+                        # Update rollout progress bar
+                        if self.rollout_pbar:
+                            self.rollout_pbar.update(1)
+                        # in pointwise, we start evaluation immediately
+                        if self.mode == "pointwise":
+                            t = asyncio.create_task(_run_eval(result_row))
+                            self.background_tasks.add(t)
+                            t.add_done_callback(self.background_tasks.discard)
+                finally:
+                    # Remove from active tracking
+                    async with self.active_rollouts_lock:
+                        self.active_rollouts[task.row_index].discard(idx)
+                        if not self.active_rollouts[task.row_index]:
+                            del self.active_rollouts[task.row_index]
+                        await self._update_rollout_pbar_postfix()
         # 3. Evaluate and Collect History
         current_batch_history_updates = []
@@ -257,6 +317,34 @@ class PriorityRolloutScheduler:
             )
             self.queue.put_nowait(new_task)
+    def _format_active_rollouts(self) -> str:
+        """Format active rollouts for display in progress bar."""
+        if not self.active_rollouts:
+            return ""
+        # Show active rows and their run indices
+        parts = []
+        for row_idx in sorted(self.active_rollouts.keys())[:5]:  # Limit to 5 rows to keep it readable
+            runs = sorted(self.active_rollouts[row_idx])
+            if runs:
+                runs_str = ",".join(str(r) for r in runs[:3])  # Show up to 3 run indices
+                if len(runs) > 3:
+                    runs_str += f"+{len(runs)-3}"
+                parts.append(f"r{row_idx}:[{runs_str}]")
+        if len(self.active_rollouts) > 5:
+            parts.append(f"+{len(self.active_rollouts)-5} more")
+        return " | ".join(parts)
+    async def _update_rollout_pbar_postfix(self):
+        """Update the rollout progress bar postfix with active tasks info."""
+        if self.rollout_pbar:
+            active_count = sum(len(runs) for runs in self.active_rollouts.values())
+            self.rollout_pbar.set_postfix_str(
+                f"active={active_count} {self._format_active_rollouts()}"
+            )
     def _post_process_result(self, res: EvaluationRow):
         """
         Process evaluation result: update cost metrics, status, and log.
@@ -294,28 +382,58 @@ class PriorityRolloutScheduler:
     async def run(self, dataset: List[EvaluationRow], num_runs: int, base_config: RolloutProcessorConfig):
         self.num_runs = num_runs
-        # 1. Schedule initial tasks
-        await self.schedule_dataset(dataset, base_config)
-        # 2. Start Workers
-        # If we have separate limits, we need enough workers to saturate both stages
-        num_workers = self.max_concurrent_rollouts
-        workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
-        # 3. Wait for completion
-        await self.queue.join()
-        # Wait for background evaluations to finish
-        if self.background_tasks:
-            await asyncio.gather(*self.background_tasks, return_exceptions=True)
+        # Calculate totals for progress bars
+        total_rollouts = len(dataset) * num_runs
+        # In pointwise mode: 1 eval per rollout; in groupwise mode: 1 eval per dataset row
+        total_evals = total_rollouts if self.mode == "pointwise" else len(dataset)
-        # 4. Cleanup
-        for w in workers:
-            w.cancel()
+        # Initialize progress bars
+        self.rollout_pbar = async_tqdm(
+            total=total_rollouts,
+            desc="🚀 Rollouts",
+            unit="row",
+            position=0,
+            leave=True,
+            colour="cyan",
+        )
+        self.eval_pbar = async_tqdm(
+            total=total_evals,
+            desc="📊 Evals",
+            unit="eval",
+            position=1,
+            leave=True,
+            colour="green",
+        )
-        if workers:
-            await asyncio.gather(*workers, return_exceptions=True)
+        try:
+            # 1. Schedule initial tasks
+            await self.schedule_dataset(dataset, base_config)
+            # 2. Start Workers
+            # If we have separate limits, we need enough workers to saturate both stages
+            num_workers = self.max_concurrent_rollouts
+            workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
+            # 3. Wait for completion
+            await self.queue.join()
+            # Wait for background evaluations to finish
+            if self.background_tasks:
+                await asyncio.gather(*self.background_tasks, return_exceptions=True)
+            # 4. Cleanup
+            for w in workers:
+                w.cancel()
+            if workers:
+                await asyncio.gather(*workers, return_exceptions=True)
+        finally:
+            # Close progress bars
+            if self.rollout_pbar:
+                self.rollout_pbar.close()
+            if self.eval_pbar:
+                self.eval_pbar.close()
         # Return collected results
         return self.results

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/remote_rollout_processor.py RENAMED Viewed

@@ -185,7 +185,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
                     f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
                 )
-            row.execution_metadata.duration_seconds = time.perf_counter() - start_time
+            row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
             def _update_with_trace() -> None:
                 return update_row_with_remote_trace(row, self._output_data_loader, model_base_url)

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2/eval_protocol.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.99
+Version: 0.2.99.dev2
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_create_rft.py RENAMED Viewed

@@ -182,8 +182,8 @@ def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeyp
     assert abs(ip["temperature"] - 0.9) < 1e-12
     assert abs(ip["topP"] - 0.95) < 1e-12
     assert ip["topK"] == 50
-    assert ip["maxTokens"] == 4096
-    assert ip["n"] == 6
+    assert ip["maxOutputTokens"] == 4096
+    assert ip["responseCandidatesCount"] == 6
     assert ip["extraBody"] == '{"foo":"bar"}'
     # W&B mapping
@@ -1126,8 +1126,8 @@ def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatc
     # Inference params mapping
     ip = body["inferenceParameters"]
-    assert ip["n"] == 4
-    assert ip["maxTokens"] == 32768
+    assert ip["responseCandidatesCount"] == 4
+    assert ip["maxOutputTokens"] == 32768
     # Other top-level
     assert body["chunkSize"] == 50

{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_priority_scheduler.py RENAMED Viewed

@@ -57,7 +57,7 @@ async def test_scheduler_basic_execution(
     micro_batch_size = 1
     # Mock rollout processor with delay
-    async def delayed_rollout(processor, rows, config, run_idx):
+    async def delayed_rollout(processor, rows, config, run_idx, **kwargs):
         await asyncio.sleep(0.01)
         for row in rows:
             yield row
@@ -110,7 +110,7 @@ async def test_concurrency_control(
     rollout_lock = asyncio.Lock()
     eval_lock = asyncio.Lock()
-    async def mock_rollout_gen(processor, rows, config, run_idx):
+    async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
         nonlocal active_rollouts, max_active_rollouts_seen
         async with rollout_lock:
             active_rollouts += 1
@@ -177,7 +177,7 @@ async def test_priority_scheduling(
     execution_order = []
-    async def mock_rollout_gen(processor, rows, config, run_idx):
+    async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
         row_id = rows[0].input_metadata.row_id
         execution_order.append(f"{row_id}_run_{run_idx}")
         for row in rows:
@@ -290,7 +290,7 @@ async def test_groupwise_mode(
         eval_calls.append(rows)
         return rows # Pass through
-    async def mock_rollout_gen(processor, rows, config, run_idx):
+    async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
         for row in rows:
             yield row

eval-protocol 0.2.99__tar.gz → 0.2.99.dev2__tar.gz

eval-protocol 0.2.99__tar.gz → 0.2.99.dev2__tar.gz