eval-protocol 0.2.99__tar.gz → 0.2.99.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.99/eval_protocol.egg-info → eval_protocol-0.2.99.dev2}/PKG-INFO +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/create_rft.py +2 -2
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/tinker_rollout_processor.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/manager.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/models.py +13 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_utils.py +9 -2
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +4 -4
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/openenv_rollout_processor.py +3 -3
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/priority_scheduler.py +178 -60
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/remote_rollout_processor.py +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_create_rft.py +4 -4
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_priority_scheduler.py +4 -4
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-CuQbfdPD.js +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-CuQbfdPD.js.map +1 -1
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/LICENSE +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/README.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/pyproject.toml +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/setup.cfg +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/setup.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/versioneer.py +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.99
|
|
3
|
+
Version: 0.2.99.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-12-
|
|
11
|
+
"date": "2025-12-17T19:22:32-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.99"
|
|
14
|
+
"full-revisionid": "686ed67e7b83d4451d8fbd613f7d261a41fff9cb",
|
|
15
|
+
"version": "0.2.99.dev.2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -660,8 +660,8 @@ def _create_rft_job(
|
|
|
660
660
|
("temperature", "temperature"),
|
|
661
661
|
("topP", "top_p"),
|
|
662
662
|
("topK", "top_k"),
|
|
663
|
-
("
|
|
664
|
-
("
|
|
663
|
+
("maxOutputTokens", "max_output_tokens"),
|
|
664
|
+
("responseCandidatesCount", "response_candidates_count"),
|
|
665
665
|
]:
|
|
666
666
|
val = getattr(args, arg_name, None)
|
|
667
667
|
if val is not None:
|
|
@@ -152,7 +152,7 @@ class TinkerRolloutProcessor(RolloutProcessor):
|
|
|
152
152
|
# Update row
|
|
153
153
|
new_messages = list(row.messages) + [Message(role="assistant", content=assistant_content)]
|
|
154
154
|
row.messages = new_messages
|
|
155
|
-
row.execution_metadata.
|
|
155
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
156
156
|
|
|
157
157
|
# Log usage (approximate since Tinker might not return usage stats in same format)
|
|
158
158
|
# We can count tokens ourselves
|
|
@@ -150,7 +150,7 @@ class ExecutionManager:
|
|
|
150
150
|
else:
|
|
151
151
|
evaluation_row.rollout_status = Status.rollout_running()
|
|
152
152
|
|
|
153
|
-
evaluation_row.execution_metadata.
|
|
153
|
+
evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - row_start_time
|
|
154
154
|
|
|
155
155
|
return evaluation_row
|
|
156
156
|
|
|
@@ -809,9 +809,21 @@ class ExecutionMetadata(BaseModel):
|
|
|
809
809
|
|
|
810
810
|
cost_metrics: Optional[CostMetrics] = Field(default=None, description="Cost breakdown for LLM API calls.")
|
|
811
811
|
|
|
812
|
+
# deprecated: use rollout_duration_seconds and eval_duration_seconds instead
|
|
812
813
|
duration_seconds: Optional[float] = Field(
|
|
813
814
|
default=None,
|
|
814
|
-
|
|
815
|
+
deprecated=True,
|
|
816
|
+
description="[Deprecated] Processing duration in seconds for this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
rollout_duration_seconds: Optional[float] = Field(
|
|
820
|
+
default=None,
|
|
821
|
+
description="Processing duration in seconds for the rollout of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
|
|
822
|
+
)
|
|
823
|
+
|
|
824
|
+
eval_duration_seconds: Optional[float] = Field(
|
|
825
|
+
default=None,
|
|
826
|
+
description="Processing duration in seconds for the evaluation of this evaluation row. Note that if it gets retried, this will be the duration of the last attempt.",
|
|
815
827
|
)
|
|
816
828
|
|
|
817
829
|
experiment_duration_seconds: Optional[float] = Field(
|
|
@@ -267,7 +267,7 @@ class AgentRolloutProcessor(RolloutProcessor):
|
|
|
267
267
|
total_tokens=agent.usage["total_tokens"],
|
|
268
268
|
)
|
|
269
269
|
|
|
270
|
-
agent.evaluation_row.execution_metadata.
|
|
270
|
+
agent.evaluation_row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
271
271
|
|
|
272
272
|
return agent.evaluation_row
|
|
273
273
|
finally:
|
|
@@ -83,7 +83,7 @@ class PydanticAgentRolloutProcessor(RolloutProcessor):
|
|
|
83
83
|
# total_tokens=usage_info.total_tokens or 0,
|
|
84
84
|
# )
|
|
85
85
|
|
|
86
|
-
row.execution_metadata.
|
|
86
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
87
87
|
|
|
88
88
|
return row
|
|
89
89
|
|
|
@@ -180,7 +180,7 @@ class SingleTurnRolloutProcessor(RolloutProcessor):
|
|
|
180
180
|
|
|
181
181
|
row.messages = messages
|
|
182
182
|
|
|
183
|
-
row.execution_metadata.
|
|
183
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
184
184
|
|
|
185
185
|
default_logger.log(row)
|
|
186
186
|
return row
|
{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/evaluation_test_utils.py
RENAMED
|
@@ -42,7 +42,7 @@ AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
|
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
async def run_tasks_with_eval_progress(
|
|
45
|
-
pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int
|
|
45
|
+
pointwise_tasks: list[asyncio.Task[EvaluationRow]], run_idx: int, disable_tqdm: bool = False
|
|
46
46
|
) -> list[EvaluationRow]:
|
|
47
47
|
"""
|
|
48
48
|
Run evaluation tasks with a progress bar and proper cancellation handling.
|
|
@@ -66,6 +66,7 @@ async def run_tasks_with_eval_progress(
|
|
|
66
66
|
miniters=1,
|
|
67
67
|
mininterval=0.1,
|
|
68
68
|
bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
|
|
69
|
+
disable=disable_tqdm,
|
|
69
70
|
) as eval_pbar:
|
|
70
71
|
|
|
71
72
|
async def task_with_progress(task: asyncio.Task[EvaluationRow]) -> EvaluationRow:
|
|
@@ -88,7 +89,10 @@ async def run_tasks_with_eval_progress(
|
|
|
88
89
|
|
|
89
90
|
|
|
90
91
|
async def run_tasks_with_run_progress(
|
|
91
|
-
execute_run_func: Callable[[int, RolloutProcessorConfig], Any],
|
|
92
|
+
execute_run_func: Callable[[int, RolloutProcessorConfig], Any],
|
|
93
|
+
num_runs: int,
|
|
94
|
+
config: RolloutProcessorConfig,
|
|
95
|
+
disable_tqdm: bool = False,
|
|
92
96
|
) -> None:
|
|
93
97
|
"""
|
|
94
98
|
Run tasks with a parallel runs progress bar, preserving original logic.
|
|
@@ -108,6 +112,7 @@ async def run_tasks_with_run_progress(
|
|
|
108
112
|
dynamic_ncols=True,
|
|
109
113
|
miniters=1,
|
|
110
114
|
bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
|
|
115
|
+
disable=disable_tqdm,
|
|
111
116
|
) as run_pbar:
|
|
112
117
|
|
|
113
118
|
async def execute_run_with_progress(run_idx: int, config: RolloutProcessorConfig) -> Any:
|
|
@@ -330,6 +335,7 @@ async def rollout_processor_with_retry(
|
|
|
330
335
|
fresh_dataset: list[EvaluationRow],
|
|
331
336
|
config: RolloutProcessorConfig,
|
|
332
337
|
run_idx: int = 0,
|
|
338
|
+
disable_tqdm: bool = False,
|
|
333
339
|
) -> AsyncGenerator[EvaluationRow, None]:
|
|
334
340
|
"""
|
|
335
341
|
Wrapper around rollout_processor that handles retry logic using the Python backoff library.
|
|
@@ -449,6 +455,7 @@ async def rollout_processor_with_retry(
|
|
|
449
455
|
miniters=1,
|
|
450
456
|
mininterval=0.1,
|
|
451
457
|
bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
|
|
458
|
+
disable=disable_tqdm,
|
|
452
459
|
) as rollout_pbar:
|
|
453
460
|
# Yield results as they complete
|
|
454
461
|
for task in asyncio.as_completed(retry_tasks):
|
|
@@ -162,7 +162,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
|
|
|
162
162
|
row.rollout_status = Status.rollout_error(
|
|
163
163
|
f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
|
|
164
164
|
)
|
|
165
|
-
row.execution_metadata.
|
|
165
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
166
166
|
return row
|
|
167
167
|
|
|
168
168
|
run_id = run.get("id")
|
|
@@ -170,7 +170,7 @@ class GithubActionRolloutProcessor(RolloutProcessor):
|
|
|
170
170
|
row.rollout_status = Status.rollout_error(
|
|
171
171
|
f"Failed to find workflow run in GHA with rollout_id {row.execution_metadata.rollout_id}"
|
|
172
172
|
)
|
|
173
|
-
row.execution_metadata.
|
|
173
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
174
174
|
return row
|
|
175
175
|
|
|
176
176
|
# Poll the specific run until completion
|
|
@@ -194,10 +194,10 @@ class GithubActionRolloutProcessor(RolloutProcessor):
|
|
|
194
194
|
row.rollout_status = Status.rollout_error(
|
|
195
195
|
f"GitHub Actions run timed out after {self.timeout_seconds} seconds"
|
|
196
196
|
)
|
|
197
|
-
row.execution_metadata.
|
|
197
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
198
198
|
return row
|
|
199
199
|
|
|
200
|
-
row.execution_metadata.
|
|
200
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
201
201
|
|
|
202
202
|
def _update_with_trace() -> None:
|
|
203
203
|
return update_row_with_remote_trace(row, self._output_data_loader, self.model_base_url)
|
{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/openenv_rollout_processor.py
RENAMED
|
@@ -411,7 +411,7 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
|
|
|
411
411
|
completion_tokens=usage["completion_tokens"],
|
|
412
412
|
total_tokens=usage["total_tokens"],
|
|
413
413
|
)
|
|
414
|
-
row.execution_metadata.
|
|
414
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
415
415
|
|
|
416
416
|
# Attach per-step rewards and accumulated token IDs to
|
|
417
417
|
# execution_metadata.extra for downstream integrations
|
|
@@ -436,14 +436,14 @@ class OpenEnvRolloutProcessor(RolloutProcessor):
|
|
|
436
436
|
logger.info("[OpenEnvRolloutProcessor] Total reward: %.3f", total_reward)
|
|
437
437
|
logger.info(
|
|
438
438
|
"[OpenEnvRolloutProcessor] Duration: %.2fs",
|
|
439
|
-
row.execution_metadata.
|
|
439
|
+
row.execution_metadata.rollout_duration_seconds,
|
|
440
440
|
)
|
|
441
441
|
logger.debug("[OpenEnvRolloutProcessor] Messages collected: %d", len(messages))
|
|
442
442
|
|
|
443
443
|
logger.info(
|
|
444
444
|
f"Rollout complete: {len(step_rewards)} steps, "
|
|
445
445
|
f"total_reward={total_reward:.2f}, "
|
|
446
|
-
f"duration={row.execution_metadata.
|
|
446
|
+
f"duration={row.execution_metadata.rollout_duration_seconds:.2f}s"
|
|
447
447
|
)
|
|
448
448
|
# Final log with complete message history
|
|
449
449
|
if getattr(config, "logger", None):
|
{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/priority_scheduler.py
RENAMED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import logging
|
|
3
3
|
import os
|
|
4
|
+
import time
|
|
4
5
|
from collections import defaultdict
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
|
-
from typing import Any,
|
|
7
|
+
from typing import Any, List, Dict, Optional, Union
|
|
8
|
+
|
|
9
|
+
from tqdm.asyncio import tqdm as async_tqdm
|
|
7
10
|
|
|
8
11
|
from eval_protocol.models import EvaluationRow, Status
|
|
9
12
|
from eval_protocol.pytest.types import RolloutProcessorConfig, TestFunction
|
|
@@ -79,6 +82,18 @@ class PriorityRolloutScheduler:
|
|
|
79
82
|
self.rollout_n = rollout_n
|
|
80
83
|
self.in_group_minibatch_size = in_group_minibatch_size if in_group_minibatch_size > 0 else rollout_n
|
|
81
84
|
self.evaluation_test_kwargs = evaluation_test_kwargs
|
|
85
|
+
|
|
86
|
+
# Progress bars (initialized in run())
|
|
87
|
+
self.rollout_pbar: Optional[async_tqdm] = None
|
|
88
|
+
self.eval_pbar: Optional[async_tqdm] = None
|
|
89
|
+
|
|
90
|
+
# Track active rollouts: {row_index: set of run_indices currently in progress}
|
|
91
|
+
self.active_rollouts: Dict[int, set] = defaultdict(set)
|
|
92
|
+
self.active_rollouts_lock = asyncio.Lock()
|
|
93
|
+
|
|
94
|
+
# Track active evaluations
|
|
95
|
+
self.active_evals: int = 0
|
|
96
|
+
self.active_evals_lock = asyncio.Lock()
|
|
82
97
|
|
|
83
98
|
async def schedule_dataset(
|
|
84
99
|
self,
|
|
@@ -132,41 +147,68 @@ class PriorityRolloutScheduler:
|
|
|
132
147
|
experiment_id = rows_to_eval[0].execution_metadata.experiment_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.experiment_id
|
|
133
148
|
run_id = rows_to_eval[0].execution_metadata.run_id if isinstance(rows_to_eval, list) else rows_to_eval.execution_metadata.run_id
|
|
134
149
|
eval_res = None
|
|
150
|
+
|
|
151
|
+
# Track active eval
|
|
152
|
+
async with self.active_evals_lock:
|
|
153
|
+
self.active_evals += 1
|
|
154
|
+
if self.eval_pbar:
|
|
155
|
+
self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
|
|
156
|
+
|
|
157
|
+
start_time = time.perf_counter()
|
|
135
158
|
|
|
136
|
-
|
|
137
|
-
async with
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
159
|
+
try:
|
|
160
|
+
async with self.eval_sem:
|
|
161
|
+
async with rollout_logging_context(
|
|
162
|
+
rollout_id or "",
|
|
163
|
+
experiment_id=experiment_id,
|
|
164
|
+
run_id=run_id,
|
|
165
|
+
):
|
|
166
|
+
if isinstance(rows_to_eval, list):
|
|
167
|
+
eval_res = await execute_pytest_with_exception_handling(
|
|
168
|
+
test_func=self.eval_executor,
|
|
169
|
+
evaluation_test_kwargs=self.evaluation_test_kwargs,
|
|
170
|
+
processed_dataset=rows_to_eval,
|
|
171
|
+
)
|
|
172
|
+
else:
|
|
173
|
+
eval_res = await execute_pytest_with_exception_handling(
|
|
174
|
+
test_func=self.eval_executor,
|
|
175
|
+
evaluation_test_kwargs=self.evaluation_test_kwargs,
|
|
176
|
+
processed_row=rows_to_eval,
|
|
177
|
+
)
|
|
178
|
+
eval_duration = time.perf_counter() - start_time
|
|
179
|
+
|
|
180
|
+
# Set eval_duration_seconds BEFORE buffer writes to ensure it's included in serialization
|
|
157
181
|
if isinstance(eval_res, list):
|
|
158
182
|
for row in eval_res:
|
|
159
|
-
|
|
160
|
-
await self.output_buffer.add_result(row)
|
|
183
|
+
row.execution_metadata.eval_duration_seconds = eval_duration
|
|
161
184
|
else:
|
|
162
|
-
|
|
163
|
-
await self.output_buffer.add_result(eval_res)
|
|
185
|
+
eval_res.execution_metadata.eval_duration_seconds = eval_duration
|
|
164
186
|
|
|
165
|
-
|
|
166
|
-
self.
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
187
|
+
# push result to the output buffer
|
|
188
|
+
if self.output_buffer:
|
|
189
|
+
if isinstance(eval_res, list):
|
|
190
|
+
for row in eval_res:
|
|
191
|
+
self._post_process_result(row)
|
|
192
|
+
await self.output_buffer.add_result(row)
|
|
193
|
+
else:
|
|
194
|
+
self._post_process_result(eval_res)
|
|
195
|
+
await self.output_buffer.add_result(eval_res)
|
|
196
|
+
|
|
197
|
+
if isinstance(eval_res, list):
|
|
198
|
+
for row in eval_res:
|
|
199
|
+
self.results.append(row)
|
|
200
|
+
else:
|
|
201
|
+
self.results.append(eval_res)
|
|
202
|
+
return eval_res
|
|
203
|
+
finally:
|
|
204
|
+
# Always update progress bar (handles both success and failure cases)
|
|
205
|
+
if self.eval_pbar:
|
|
206
|
+
self.eval_pbar.update(1)
|
|
207
|
+
# Decrement active eval counter
|
|
208
|
+
async with self.active_evals_lock:
|
|
209
|
+
self.active_evals -= 1
|
|
210
|
+
if self.eval_pbar:
|
|
211
|
+
self.eval_pbar.set_postfix_str(f"active={self.active_evals}")
|
|
170
212
|
|
|
171
213
|
# 1. Prepare Config & Row for this micro-batch
|
|
172
214
|
current_batch_rows = []
|
|
@@ -205,15 +247,33 @@ class PriorityRolloutScheduler:
|
|
|
205
247
|
batch_results: List[EvaluationRow] = []
|
|
206
248
|
if current_batch_rows:
|
|
207
249
|
for idx, row in current_batch_rows:
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
self.
|
|
216
|
-
|
|
250
|
+
# Track this rollout as active
|
|
251
|
+
async with self.active_rollouts_lock:
|
|
252
|
+
self.active_rollouts[task.row_index].add(idx)
|
|
253
|
+
await self._update_rollout_pbar_postfix()
|
|
254
|
+
|
|
255
|
+
try:
|
|
256
|
+
async for result_row in rollout_processor_with_retry(
|
|
257
|
+
self.rollout_processor, [row], task.config, idx, disable_tqdm=True
|
|
258
|
+
):
|
|
259
|
+
batch_results.append(result_row)
|
|
260
|
+
|
|
261
|
+
# Update rollout progress bar
|
|
262
|
+
if self.rollout_pbar:
|
|
263
|
+
self.rollout_pbar.update(1)
|
|
264
|
+
|
|
265
|
+
# in pointwise, we start evaluation immediately
|
|
266
|
+
if self.mode == "pointwise":
|
|
267
|
+
t = asyncio.create_task(_run_eval(result_row))
|
|
268
|
+
self.background_tasks.add(t)
|
|
269
|
+
t.add_done_callback(self.background_tasks.discard)
|
|
270
|
+
finally:
|
|
271
|
+
# Remove from active tracking
|
|
272
|
+
async with self.active_rollouts_lock:
|
|
273
|
+
self.active_rollouts[task.row_index].discard(idx)
|
|
274
|
+
if not self.active_rollouts[task.row_index]:
|
|
275
|
+
del self.active_rollouts[task.row_index]
|
|
276
|
+
await self._update_rollout_pbar_postfix()
|
|
217
277
|
|
|
218
278
|
# 3. Evaluate and Collect History
|
|
219
279
|
current_batch_history_updates = []
|
|
@@ -257,6 +317,34 @@ class PriorityRolloutScheduler:
|
|
|
257
317
|
)
|
|
258
318
|
self.queue.put_nowait(new_task)
|
|
259
319
|
|
|
320
|
+
def _format_active_rollouts(self) -> str:
|
|
321
|
+
"""Format active rollouts for display in progress bar."""
|
|
322
|
+
if not self.active_rollouts:
|
|
323
|
+
return ""
|
|
324
|
+
|
|
325
|
+
# Show active rows and their run indices
|
|
326
|
+
parts = []
|
|
327
|
+
for row_idx in sorted(self.active_rollouts.keys())[:5]: # Limit to 5 rows to keep it readable
|
|
328
|
+
runs = sorted(self.active_rollouts[row_idx])
|
|
329
|
+
if runs:
|
|
330
|
+
runs_str = ",".join(str(r) for r in runs[:3]) # Show up to 3 run indices
|
|
331
|
+
if len(runs) > 3:
|
|
332
|
+
runs_str += f"+{len(runs)-3}"
|
|
333
|
+
parts.append(f"r{row_idx}:[{runs_str}]")
|
|
334
|
+
|
|
335
|
+
if len(self.active_rollouts) > 5:
|
|
336
|
+
parts.append(f"+{len(self.active_rollouts)-5} more")
|
|
337
|
+
|
|
338
|
+
return " | ".join(parts)
|
|
339
|
+
|
|
340
|
+
async def _update_rollout_pbar_postfix(self):
|
|
341
|
+
"""Update the rollout progress bar postfix with active tasks info."""
|
|
342
|
+
if self.rollout_pbar:
|
|
343
|
+
active_count = sum(len(runs) for runs in self.active_rollouts.values())
|
|
344
|
+
self.rollout_pbar.set_postfix_str(
|
|
345
|
+
f"active={active_count} {self._format_active_rollouts()}"
|
|
346
|
+
)
|
|
347
|
+
|
|
260
348
|
def _post_process_result(self, res: EvaluationRow):
|
|
261
349
|
"""
|
|
262
350
|
Process evaluation result: update cost metrics, status, and log.
|
|
@@ -294,28 +382,58 @@ class PriorityRolloutScheduler:
|
|
|
294
382
|
async def run(self, dataset: List[EvaluationRow], num_runs: int, base_config: RolloutProcessorConfig):
|
|
295
383
|
self.num_runs = num_runs
|
|
296
384
|
|
|
297
|
-
#
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
# If we have separate limits, we need enough workers to saturate both stages
|
|
302
|
-
num_workers = self.max_concurrent_rollouts
|
|
303
|
-
|
|
304
|
-
workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
|
|
305
|
-
|
|
306
|
-
# 3. Wait for completion
|
|
307
|
-
await self.queue.join()
|
|
308
|
-
|
|
309
|
-
# Wait for background evaluations to finish
|
|
310
|
-
if self.background_tasks:
|
|
311
|
-
await asyncio.gather(*self.background_tasks, return_exceptions=True)
|
|
385
|
+
# Calculate totals for progress bars
|
|
386
|
+
total_rollouts = len(dataset) * num_runs
|
|
387
|
+
# In pointwise mode: 1 eval per rollout; in groupwise mode: 1 eval per dataset row
|
|
388
|
+
total_evals = total_rollouts if self.mode == "pointwise" else len(dataset)
|
|
312
389
|
|
|
313
|
-
#
|
|
314
|
-
|
|
315
|
-
|
|
390
|
+
# Initialize progress bars
|
|
391
|
+
self.rollout_pbar = async_tqdm(
|
|
392
|
+
total=total_rollouts,
|
|
393
|
+
desc="🚀 Rollouts",
|
|
394
|
+
unit="row",
|
|
395
|
+
position=0,
|
|
396
|
+
leave=True,
|
|
397
|
+
colour="cyan",
|
|
398
|
+
)
|
|
399
|
+
self.eval_pbar = async_tqdm(
|
|
400
|
+
total=total_evals,
|
|
401
|
+
desc="📊 Evals",
|
|
402
|
+
unit="eval",
|
|
403
|
+
position=1,
|
|
404
|
+
leave=True,
|
|
405
|
+
colour="green",
|
|
406
|
+
)
|
|
316
407
|
|
|
317
|
-
|
|
318
|
-
|
|
408
|
+
try:
|
|
409
|
+
# 1. Schedule initial tasks
|
|
410
|
+
await self.schedule_dataset(dataset, base_config)
|
|
411
|
+
|
|
412
|
+
# 2. Start Workers
|
|
413
|
+
# If we have separate limits, we need enough workers to saturate both stages
|
|
414
|
+
num_workers = self.max_concurrent_rollouts
|
|
415
|
+
|
|
416
|
+
workers = [asyncio.create_task(self.worker()) for _ in range(num_workers)]
|
|
417
|
+
|
|
418
|
+
# 3. Wait for completion
|
|
419
|
+
await self.queue.join()
|
|
420
|
+
|
|
421
|
+
# Wait for background evaluations to finish
|
|
422
|
+
if self.background_tasks:
|
|
423
|
+
await asyncio.gather(*self.background_tasks, return_exceptions=True)
|
|
424
|
+
|
|
425
|
+
# 4. Cleanup
|
|
426
|
+
for w in workers:
|
|
427
|
+
w.cancel()
|
|
428
|
+
|
|
429
|
+
if workers:
|
|
430
|
+
await asyncio.gather(*workers, return_exceptions=True)
|
|
431
|
+
finally:
|
|
432
|
+
# Close progress bars
|
|
433
|
+
if self.rollout_pbar:
|
|
434
|
+
self.rollout_pbar.close()
|
|
435
|
+
if self.eval_pbar:
|
|
436
|
+
self.eval_pbar.close()
|
|
319
437
|
|
|
320
438
|
# Return collected results
|
|
321
439
|
return self.results
|
{eval_protocol-0.2.99 → eval_protocol-0.2.99.dev2}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -185,7 +185,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
185
185
|
f"Rollout {row.execution_metadata.rollout_id} timed out after {timeout_seconds} seconds"
|
|
186
186
|
)
|
|
187
187
|
|
|
188
|
-
row.execution_metadata.
|
|
188
|
+
row.execution_metadata.rollout_duration_seconds = time.perf_counter() - start_time
|
|
189
189
|
|
|
190
190
|
def _update_with_trace() -> None:
|
|
191
191
|
return update_row_with_remote_trace(row, self._output_data_loader, model_base_url)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.99
|
|
3
|
+
Version: 0.2.99.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -182,8 +182,8 @@ def test_create_rft_passes_all_flags_into_request_body(rft_test_harness, monkeyp
|
|
|
182
182
|
assert abs(ip["temperature"] - 0.9) < 1e-12
|
|
183
183
|
assert abs(ip["topP"] - 0.95) < 1e-12
|
|
184
184
|
assert ip["topK"] == 50
|
|
185
|
-
assert ip["
|
|
186
|
-
assert ip["
|
|
185
|
+
assert ip["maxOutputTokens"] == 4096
|
|
186
|
+
assert ip["responseCandidatesCount"] == 6
|
|
187
187
|
assert ip["extraBody"] == '{"foo":"bar"}'
|
|
188
188
|
|
|
189
189
|
# W&B mapping
|
|
@@ -1126,8 +1126,8 @@ def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatc
|
|
|
1126
1126
|
|
|
1127
1127
|
# Inference params mapping
|
|
1128
1128
|
ip = body["inferenceParameters"]
|
|
1129
|
-
assert ip["
|
|
1130
|
-
assert ip["
|
|
1129
|
+
assert ip["responseCandidatesCount"] == 4
|
|
1130
|
+
assert ip["maxOutputTokens"] == 32768
|
|
1131
1131
|
|
|
1132
1132
|
# Other top-level
|
|
1133
1133
|
assert body["chunkSize"] == 50
|
|
@@ -57,7 +57,7 @@ async def test_scheduler_basic_execution(
|
|
|
57
57
|
micro_batch_size = 1
|
|
58
58
|
|
|
59
59
|
# Mock rollout processor with delay
|
|
60
|
-
async def delayed_rollout(processor, rows, config, run_idx):
|
|
60
|
+
async def delayed_rollout(processor, rows, config, run_idx, **kwargs):
|
|
61
61
|
await asyncio.sleep(0.01)
|
|
62
62
|
for row in rows:
|
|
63
63
|
yield row
|
|
@@ -110,7 +110,7 @@ async def test_concurrency_control(
|
|
|
110
110
|
rollout_lock = asyncio.Lock()
|
|
111
111
|
eval_lock = asyncio.Lock()
|
|
112
112
|
|
|
113
|
-
async def mock_rollout_gen(processor, rows, config, run_idx):
|
|
113
|
+
async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
|
|
114
114
|
nonlocal active_rollouts, max_active_rollouts_seen
|
|
115
115
|
async with rollout_lock:
|
|
116
116
|
active_rollouts += 1
|
|
@@ -177,7 +177,7 @@ async def test_priority_scheduling(
|
|
|
177
177
|
|
|
178
178
|
execution_order = []
|
|
179
179
|
|
|
180
|
-
async def mock_rollout_gen(processor, rows, config, run_idx):
|
|
180
|
+
async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
|
|
181
181
|
row_id = rows[0].input_metadata.row_id
|
|
182
182
|
execution_order.append(f"{row_id}_run_{run_idx}")
|
|
183
183
|
for row in rows:
|
|
@@ -290,7 +290,7 @@ async def test_groupwise_mode(
|
|
|
290
290
|
eval_calls.append(rows)
|
|
291
291
|
return rows # Pass through
|
|
292
292
|
|
|
293
|
-
async def mock_rollout_gen(processor, rows, config, run_idx):
|
|
293
|
+
async def mock_rollout_gen(processor, rows, config, run_idx, **kwargs):
|
|
294
294
|
for row in rows:
|
|
295
295
|
yield row
|
|
296
296
|
|