eval-protocol 0.2.8__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.8/eval_protocol.egg-info → eval_protocol-0.2.9}/PKG-INFO +1 -1
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/_version.py +3 -3
- eval_protocol-0.2.9/eval_protocol/benchmarks/__init__.py +9 -0
- eval_protocol-0.2.9/eval_protocol/benchmarks/registry.py +174 -0
- eval_protocol-0.2.9/eval_protocol/benchmarks/run.py +100 -0
- eval_protocol-0.2.9/eval_protocol/benchmarks/suites/__init__.py +3 -0
- eval_protocol-0.2.9/eval_protocol/benchmarks/suites/aime25.py +118 -0
- eval_protocol-0.2.9/eval_protocol/benchmarks/suites/gpqa.py +100 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/base_policy.py +17 -11
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/manager.py +27 -33
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/policy.py +2 -1
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/models.py +3 -3
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/playback_policy.py +2 -2
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_single_turn_rollout_process.py +19 -5
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/evaluation_test.py +328 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/plugin.py +2 -3
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/types/types.py +27 -3
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/static_policy.py +4 -4
- {eval_protocol-0.2.8 → eval_protocol-0.2.9/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/SOURCES.txt +9 -3
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_rollout_control_plane_integration.py +7 -3
- eval_protocol-0.2.9/vite-app/dist/assets/index-CmEkuH8E.js +93 -0
- eval_protocol-0.2.9/vite-app/dist/assets/index-CmEkuH8E.js.map +1 -0
- eval_protocol-0.2.9/vite-app/dist/assets/index-DZwKPeo5.css +1 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +0 -1
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +0 -88
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +0 -1
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/LICENSE +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/README.md +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/pyproject.toml +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/setup.cfg +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/setup.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_config.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_format.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_length.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_math.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_models.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/versioneer.py +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.8 → eval_protocol-0.2.9}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.2.9
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-12T13:33:17-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.8"
|
|
14
|
+
"full-revisionid": "6b018d4d211d239896a5bda83b375b9bbb4fca34",
|
|
15
|
+
"version": "0.2.9"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Benchmark registry and export decorator.
|
|
3
|
+
|
|
4
|
+
This module provides a lightweight registry for benchmarks and a decorator
|
|
5
|
+
`@export_benchmark(name)` that can be stacked with `@evaluation_test`.
|
|
6
|
+
|
|
7
|
+
It registers a runnable handle that executes the exact same evaluation pipeline
|
|
8
|
+
as the pytest flow by calling `run_evaluation_test_direct` with the parameters
|
|
9
|
+
captured from the decorated function.
|
|
10
|
+
|
|
11
|
+
Usage in a suite module (stack under @evaluation_test):
|
|
12
|
+
|
|
13
|
+
from eval_protocol.benchmarks.registry import export_benchmark
|
|
14
|
+
|
|
15
|
+
@export_benchmark("aime25_low")
|
|
16
|
+
@evaluation_test(...)
|
|
17
|
+
def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow:
|
|
18
|
+
...
|
|
19
|
+
|
|
20
|
+
Programmatic run:
|
|
21
|
+
|
|
22
|
+
from eval_protocol.benchmarks.registry import get_benchmark_runner
|
|
23
|
+
get_benchmark_runner("aime25_low")(model="fireworks_ai/...", print_summary=True, out="artifacts/aime.json")
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Global registry: name -> callable runner
|
|
34
|
+
_BENCHMARK_REGISTRY: Dict[str, Callable[..., Any]] = {}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def list_benchmarks() -> List[str]:
    """Return the names of all registered benchmarks in sorted order."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return sorted(_BENCHMARK_REGISTRY)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_benchmark_runner(name: str) -> Callable[..., Any]:
    """Return the runner callable registered under ``name``.

    Args:
        name: Benchmark name as passed to ``@export_benchmark``.

    Raises:
        KeyError: If nothing is registered under ``name``. The message lists
            the names that are currently registered.
    """
    try:
        return _BENCHMARK_REGISTRY[name]
    except KeyError as exc:
        # Re-raise with a friendlier message while keeping the original as cause.
        raise KeyError(f"Benchmark '{name}' not found. Available: {list_benchmarks()}") from exc
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def export_benchmark(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
|
|
49
|
+
"""
|
|
50
|
+
Decorator to export a benchmark test into the global registry.
|
|
51
|
+
|
|
52
|
+
This expects to be stacked with `@evaluation_test`, so the decorated function
|
|
53
|
+
should carry `__ep_config` and `__ep_original_test_func` attributes that the
|
|
54
|
+
decorator can read to construct a direct runner.
|
|
55
|
+
|
|
56
|
+
The registered runner supports a subset of convenient overrides and maps them
|
|
57
|
+
to the same EP_* environment variables used by the pytest plugin to ensure
|
|
58
|
+
identical summaries and JSON artifact behavior.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
def _decorator(test_wrapper: Callable[..., Any]) -> Callable[..., Any]:
|
|
62
|
+
# Pull through metadata attached by evaluation_test
|
|
63
|
+
ep_config: Dict[str, Any] = getattr(test_wrapper, "__ep_config", {})
|
|
64
|
+
original_test_func: Optional[Callable[..., Any]] = getattr(
|
|
65
|
+
test_wrapper, "__ep_original_test_func", None
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
def _runner(
|
|
69
|
+
*,
|
|
70
|
+
model: Optional[str] = None,
|
|
71
|
+
print_summary: bool = False,
|
|
72
|
+
out: Optional[str] = None,
|
|
73
|
+
reasoning_effort: Optional[str] = None,
|
|
74
|
+
max_rows: Optional[int | str] = None,
|
|
75
|
+
num_runs: Optional[int] = None,
|
|
76
|
+
input_params_override: Optional[Dict[str, Any]] = None,
|
|
77
|
+
max_concurrency: Optional[int] = None,
|
|
78
|
+
) -> Any:
|
|
79
|
+
# Map convenience flags to EP_* env used by the pytest flow
|
|
80
|
+
if print_summary:
|
|
81
|
+
os.environ["EP_PRINT_SUMMARY"] = "1"
|
|
82
|
+
if out:
|
|
83
|
+
os.environ["EP_SUMMARY_JSON"] = out
|
|
84
|
+
# Merge reasoning effort and arbitrary overrides into EP_INPUT_PARAMS_JSON
|
|
85
|
+
merged: Dict[str, Any] = {}
|
|
86
|
+
if reasoning_effort:
|
|
87
|
+
# Fireworks OpenAI-compatible endpoint expects extra_body.reasoning_effort, not nested reasoning dict
|
|
88
|
+
merged.setdefault("extra_body", {})["reasoning_effort"] = str(reasoning_effort)
|
|
89
|
+
if input_params_override:
|
|
90
|
+
def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]:
|
|
91
|
+
for k, v in over.items():
|
|
92
|
+
if isinstance(v, dict) and isinstance(base.get(k), dict):
|
|
93
|
+
_deep_update(base[k], v)
|
|
94
|
+
else:
|
|
95
|
+
base[k] = v
|
|
96
|
+
return base
|
|
97
|
+
merged = _deep_update(merged, dict(input_params_override))
|
|
98
|
+
if merged:
|
|
99
|
+
os.environ["EP_INPUT_PARAMS_JSON"] = json.dumps(merged)
|
|
100
|
+
|
|
101
|
+
if max_rows is not None:
|
|
102
|
+
if isinstance(max_rows, str) and max_rows.strip().lower() == "all":
|
|
103
|
+
os.environ["EP_MAX_DATASET_ROWS"] = "None"
|
|
104
|
+
else:
|
|
105
|
+
os.environ["EP_MAX_DATASET_ROWS"] = str(max_rows)
|
|
106
|
+
|
|
107
|
+
# Build effective parameters, preferring overrides
|
|
108
|
+
models: List[str] = ep_config.get("model") or []
|
|
109
|
+
model_to_use = model or (models[0] if models else None)
|
|
110
|
+
if not model_to_use:
|
|
111
|
+
raise ValueError(
|
|
112
|
+
f"No model provided and none captured from evaluation_test for benchmark '{name}'"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
input_messages = ep_config.get("input_messages")
|
|
116
|
+
input_dataset = ep_config.get("input_dataset")
|
|
117
|
+
dataset_adapter = ep_config.get("dataset_adapter")
|
|
118
|
+
rollout_input_params_list = ep_config.get("rollout_input_params")
|
|
119
|
+
rollout_processor = ep_config.get("rollout_processor")
|
|
120
|
+
aggregation_method = ep_config.get("aggregation_method")
|
|
121
|
+
threshold = ep_config.get("threshold_of_success")
|
|
122
|
+
default_num_runs = ep_config.get("num_runs")
|
|
123
|
+
max_dataset_rows = ep_config.get("max_dataset_rows")
|
|
124
|
+
mcp_config_path = ep_config.get("mcp_config_path")
|
|
125
|
+
max_concurrent_rollouts = ep_config.get("max_concurrent_rollouts")
|
|
126
|
+
if max_concurrency is not None:
|
|
127
|
+
max_concurrent_rollouts = int(max_concurrency)
|
|
128
|
+
server_script_path = ep_config.get("server_script_path")
|
|
129
|
+
steps = ep_config.get("steps")
|
|
130
|
+
mode = ep_config.get("mode")
|
|
131
|
+
combine_datasets = ep_config.get("combine_datasets")
|
|
132
|
+
|
|
133
|
+
# Choose the first rollout param set by default
|
|
134
|
+
rollout_params = None
|
|
135
|
+
if isinstance(rollout_input_params_list, list) and rollout_input_params_list:
|
|
136
|
+
rollout_params = rollout_input_params_list[0]
|
|
137
|
+
|
|
138
|
+
# Import runner lazily to avoid hard import dependencies and circulars
|
|
139
|
+
import importlib
|
|
140
|
+
|
|
141
|
+
_mod = importlib.import_module("eval_protocol.pytest.evaluation_test")
|
|
142
|
+
run_evaluation_test_direct = getattr(_mod, "run_evaluation_test_direct")
|
|
143
|
+
|
|
144
|
+
return run_evaluation_test_direct(
|
|
145
|
+
test_func=original_test_func or test_wrapper,
|
|
146
|
+
model=model_to_use,
|
|
147
|
+
input_messages=input_messages,
|
|
148
|
+
input_dataset=input_dataset,
|
|
149
|
+
dataset_adapter=dataset_adapter,
|
|
150
|
+
rollout_input_params=rollout_params,
|
|
151
|
+
rollout_processor=rollout_processor,
|
|
152
|
+
aggregation_method=aggregation_method,
|
|
153
|
+
threshold_of_success=threshold,
|
|
154
|
+
num_runs=(num_runs if num_runs is not None else default_num_runs),
|
|
155
|
+
max_dataset_rows=max_dataset_rows,
|
|
156
|
+
mcp_config_path=mcp_config_path,
|
|
157
|
+
max_concurrent_rollouts=max_concurrent_rollouts,
|
|
158
|
+
server_script_path=server_script_path,
|
|
159
|
+
steps=steps,
|
|
160
|
+
mode=mode,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# Register runner
|
|
164
|
+
if name in _BENCHMARK_REGISTRY:
|
|
165
|
+
# Overwrite with latest definition
|
|
166
|
+
_BENCHMARK_REGISTRY[name] = _runner
|
|
167
|
+
else:
|
|
168
|
+
_BENCHMARK_REGISTRY[name] = _runner
|
|
169
|
+
|
|
170
|
+
return test_wrapper
|
|
171
|
+
|
|
172
|
+
return _decorator
|
|
173
|
+
|
|
174
|
+
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal CLI runner for exported benchmarks.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
|
|
6
|
+
python -m eval_protocol.benchmarks.run aime25 \
|
|
7
|
+
--model fireworks_ai/accounts/fireworks/models/gpt-oss-120b \
|
|
8
|
+
--print-summary \
|
|
9
|
+
--out artifacts/aime25_low.json \
|
|
10
|
+
--max-rows 50 \
|
|
11
|
+
--reasoning-effort low
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from importlib import import_module
|
|
20
|
+
import pkgutil
|
|
21
|
+
import eval_protocol.benchmarks.suites as suites_pkg
|
|
22
|
+
from eval_protocol.benchmarks.registry import get_benchmark_runner, list_benchmarks
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_args() -> argparse.Namespace:
|
|
26
|
+
parser = argparse.ArgumentParser(description="Run an exported eval-protocol benchmark")
|
|
27
|
+
parser.add_argument("name", help=f"Benchmark name. Known: {', '.join(list_benchmarks()) or '(none)'}")
|
|
28
|
+
parser.add_argument("--model", required=True, help="Model identifier (provider/model)")
|
|
29
|
+
parser.add_argument("--print-summary", action="store_true", help="Print concise EP summary line")
|
|
30
|
+
parser.add_argument("--out", help="Write JSON summary artifact to path or directory")
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"--reasoning-effort",
|
|
33
|
+
choices=["low", "medium", "high"],
|
|
34
|
+
help="Sets extra_body.reasoning.effort via EP_INPUT_PARAMS_JSON",
|
|
35
|
+
)
|
|
36
|
+
parser.add_argument(
|
|
37
|
+
"--max-rows",
|
|
38
|
+
help="Limit rows: integer or 'all' for no limit (maps to EP_MAX_DATASET_ROWS)",
|
|
39
|
+
)
|
|
40
|
+
parser.add_argument("--num-runs", type=int, help="Override num_runs if provided")
|
|
41
|
+
parser.add_argument("--max-tokens", type=int, help="Override max_tokens for generation requests")
|
|
42
|
+
parser.add_argument("--max-concurrency", type=int, help="Override max concurrent rollouts")
|
|
43
|
+
# Allow overriding reasoning effort explicitly (low/medium/high). If omitted, suite default is used.
|
|
44
|
+
# Already mapped by --reasoning-effort above.
|
|
45
|
+
return parser.parse_args()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def main() -> int:
    """CLI entry point: import the suites, look up the benchmark and run it.

    Returns:
        ``0`` once the runner returns. Exceptions from the lookup or the runner
        are not caught here and propagate to the caller.
    """
    import sys
    import traceback

    args = _parse_args()

    # Import all suite modules so their @export_benchmark decorators register
    for modinfo in pkgutil.iter_modules(suites_pkg.__path__):
        mod_name = f"{suites_pkg.__name__}.{modinfo.name}"
        try:
            import_module(mod_name)
        except Exception as e:  # one broken suite must not hide the others
            print(f"[bench] failed to import suite module: {mod_name}: {e}", file=sys.stderr)
            traceback.print_exc()

    # Fallback: if nothing registered yet and a known suite was requested, try explicit import
    if not list_benchmarks():
        known_map = {
            # Names the bundled suites register, plus the older "aime25_low" alias.
            "aime25": "eval_protocol.benchmarks.suites.aime25",
            "aime25_low": "eval_protocol.benchmarks.suites.aime25",
            "gpqa": "eval_protocol.benchmarks.suites.gpqa",
        }
        forced = known_map.get(args.name)
        if forced:
            try:
                import_module(forced)
            except Exception as e:
                print(f"[bench] explicit import failed for {forced}: {e}", file=sys.stderr)

    runner = get_benchmark_runner(args.name)

    # --max-rows arrives as a string: use an int when it parses, otherwise pass
    # the raw text (e.g. "all") through for the runner to interpret.
    max_rows: int | str | None = None
    if args.max_rows is not None:
        try:
            max_rows = int(args.max_rows)
        except ValueError:
            max_rows = str(args.max_rows)

    # Build input params override if needed
    ip_override = {}
    if args.max_tokens is not None:
        ip_override["max_tokens"] = int(args.max_tokens)

    runner(
        model=args.model,
        print_summary=args.print_summary,
        out=args.out,
        reasoning_effort=args.reasoning_effort,
        max_rows=max_rows,
        num_runs=args.num_runs,
        input_params_override=(ip_override or None),
        max_concurrency=args.max_concurrency,
    )
    # Non-zero exit on failure gate is handled within the runner via assertions
    return 0
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
raise SystemExit(main())
|
|
99
|
+
|
|
100
|
+
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from typing import Any, Dict, List, Optional
|
|
2
|
+
|
|
3
|
+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
|
|
4
|
+
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
5
|
+
default_single_turn_rollout_processor,
|
|
6
|
+
)
|
|
7
|
+
from eval_protocol.pytest.evaluation_test import evaluation_test
|
|
8
|
+
from eval_protocol.benchmarks.registry import export_benchmark
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
SYSTEM_PROMPT = (
|
|
12
|
+
"You are a helpful math assistant. Please reason step by step, and put your "
|
|
13
|
+
"final answer within \\boxed{...}."
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _extract_boxed_text(text: str) -> str:
    """Pull the final answer string out of a model response.

    The last non-empty ``boxed{...}`` or ``framebox{...}`` payload wins; if it
    contains commas only the part after the last comma is kept, stripped. With
    no usable box, the last run of digits in ``text`` is returned. Returns
    ``""`` when ``text`` is empty or nothing matches.

    The match is non-greedy and stops at the first ``}``, so a nested payload
    such as ``boxed{\\frac{1}{2}}`` is cut off at the inner brace.
    """
    import re

    if not text:
        return ""

    # DOTALL lets a boxed payload span multiple lines.
    boxed_matches = re.findall(r"boxed{(.*?)}|framebox{(.*?)}", text, re.DOTALL)
    # Each match is a (boxed, framebox) pair with one side empty; scan from the end.
    for groups in reversed(boxed_matches):
        for group in groups:
            if group:
                return group.split(",")[-1].strip()

    digit_runs = re.findall(r"\d+", text)
    return digit_runs[-1] if digit_runs else ""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _normalize_to_int_or_none(s: Optional[str]) -> Optional[int]:
    """Parse the leading run of digits of ``s`` as an int.

    Surrounding whitespace is ignored and anything after the leading digits is
    dropped (``"12abc"`` -> ``12``). Returns ``None`` for ``None``, or when the
    stripped text does not start with a digit; a sign counts as a non-digit, so
    ``"-5"`` gives ``None``.
    """
    import re

    if s is None:
        return None
    leading_digits = re.match(r"\d+", str(s).strip())
    if not leading_digits:
        return None
    try:
        return int(leading_digits.group(0))
    except ValueError:
        # Defensive: treat any int() rejection of the digit run as "no answer".
        return None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert raw AIME 2025 records into ``EvaluationRow`` objects.

    Each record's ``"question"`` (default ``""``) becomes the user message after
    the shared system prompt. Its ``"answer"`` is stored as a string in
    ``ground_truth``, or ``None`` when the key is missing or ``None``.
    """
    converted: List[EvaluationRow] = []
    for record in rows:
        question = record.get("question", "")
        answer = record.get("answer", None)
        messages = [
            Message(role="system", content=SYSTEM_PROMPT),
            Message(role="user", content=str(question)),
        ]
        # Only stringify a real answer so a missing one stays None, not "None".
        ground_truth = str(answer) if answer is not None else None
        converted.append(EvaluationRow(messages=messages, ground_truth=ground_truth))
    return converted
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@export_benchmark("aime25")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_dataset=[
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
        "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
    ],
    dataset_adapter=aime2025_dataset_adapter,
    rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    # NOTE(review): presumably caps the default run at two rows — confirm this
    # smoke-test sized default is intended for the exported benchmark.
    max_dataset_rows=2,
    max_concurrent_rollouts=4,
    mode="pointwise",
)
def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Score one AIME 2025 row by exact integer match.

    The answer is extracted from the last assistant message and compared with
    the row's ground truth after both are normalised to integers. The score is
    1.0 on a match and 0.0 otherwise; the result is flagged invalid when either
    side fails to parse.
    """
    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
    content = assistant_msgs[-1].content if assistant_msgs else ""

    extracted_text = _extract_boxed_text(content or "")
    extracted_int = _normalize_to_int_or_none(extracted_text)
    gt_int = _normalize_to_int_or_none(row.ground_truth or "")

    # A comparison is only meaningful when both sides parsed to an integer.
    is_valid = extracted_int is not None and gt_int is not None
    score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0

    if score == 1.0:
        metric_reason = "Parsed both integers and they matched"
    elif is_valid:
        metric_reason = "Parsed integers did not match"
    else:
        metric_reason = "Failed to parse integer"

    metrics = {
        "exact_match": MetricResult(
            score=score,
            is_score_valid=is_valid,
            reason=metric_reason,
            data={
                "extracted_text": extracted_text,
                "extracted_int": extracted_int,
                "ground_truth_int": gt_int,
            },
        )
    }

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=("Answer correct" if score == 1.0 else "Answer incorrect"),
        is_score_valid=is_valid,
        metrics=metrics,
    )
    return row
|
|
117
|
+
|
|
118
|
+
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import io
|
|
5
|
+
import re
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
|
|
9
|
+
from eval_protocol.pytest.evaluation_test import evaluation_test
|
|
10
|
+
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
11
|
+
default_single_turn_rollout_processor,
|
|
12
|
+
)
|
|
13
|
+
from eval_protocol.benchmarks.registry import export_benchmark
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
SYSTEM_PROMPT = (
|
|
17
|
+
"You are a helpful assistant. Read the question and options carefully. "
|
|
18
|
+
"Express your final answer strictly as a single letter: A, B, C, or D."
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _load_gpqa_messages_from_csv() -> List[List[Message]]:
    """Download GPQA-diamond and build one message list per question.

    Each entry holds the system prompt, a user message with the question and
    four lettered options, and a trailing ``__GT__:<letter>`` system message
    carrying the correct option for the scorer.

    The option order is shuffled per question with a fixed-seed RNG, so the
    correct answer is not always (A) yet the layout is identical on every load.

    NOTE(review): the ``__GT__`` marker is part of the conversation; confirm the
    rollout processor strips it before the request reaches the model.

    Raises:
        requests.HTTPError: If the download returns an error status.
        RuntimeError: If the CSV yields no rows.
    """
    import random

    url = "https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv"
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()

    letters = "ABCD"
    # Fixed seed: the shuffle must be reproducible across processes and runs.
    rng = random.Random(0)

    messages_list: List[List[Message]] = []
    reader = csv.DictReader(io.StringIO(resp.text))
    for ex in reader:
        q = str(ex.get("Question", ""))
        correct = str(ex.get("Correct Answer", "")).strip()
        inc1 = str(ex.get("Incorrect Answer 1", ""))
        inc2 = str(ex.get("Incorrect Answer 2", ""))
        inc3 = str(ex.get("Incorrect Answer 3", ""))
        # Index 0 is the correct answer before shuffling.
        choices = [correct, inc1, inc2, inc3]
        order = rng.sample(range(4), 4)
        shuffled = [choices[i] for i in order]
        gt_letter = letters[order.index(0)]
        user_content = (
            f"{q}\n\n(A) {shuffled[0]}\n(B) {shuffled[1]}\n(C) {shuffled[2]}\n(D) {shuffled[3]}\n\nAnswer with one letter."
        )
        messages_list.append(
            [
                Message(role="system", content=SYSTEM_PROMPT),
                Message(role="user", content=user_content),
                # Ground-truth marker read back by the scoring function.
                Message(role="system", content=f"__GT__:{gt_letter}"),
            ]
        )
    if not messages_list:
        raise RuntimeError("Failed to load GPQA messages: no rows found from source")
    return messages_list
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_abcd_letter(text: str) -> str | None:
|
|
53
|
+
if not text:
|
|
54
|
+
return None
|
|
55
|
+
m = re.search(r"\b([ABCD])\b", text.upper())
|
|
56
|
+
return m.group(1) if m else None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# NOTE(review): unlike test_aime25_pointwise this function has no "test_" prefix,
# so default pytest collection would presumably skip it — confirm that is intended.
@export_benchmark("gpqa")
@evaluation_test(
    model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
    input_messages=_GPQA_INPUT_MESSAGES,
    rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
    rollout_processor=default_single_turn_rollout_processor,
    aggregation_method="mean",
    threshold_of_success=None,
    num_runs=8,
    mode="pointwise",
)
def gpqa_pointwise(row: EvaluationRow) -> EvaluationRow:
    """Score one GPQA row by comparing the predicted option letter to the ground truth.

    The prediction comes from the last assistant message. The ground truth is
    read from the last system message whose content starts with ``__GT__:``.
    The score is 1.0 on a match and 0.0 otherwise; the result is flagged invalid
    when no letter is extracted or the ground truth is not one of A-D.
    """
    assistant_msgs = [m for m in row.messages if m.role == "assistant"]
    content = assistant_msgs[-1].content if assistant_msgs else ""

    pred = _extract_abcd_letter(content or "")
    # Retrieve GT from the trailing system message we appended
    gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")]
    gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None

    is_valid = pred is not None and gt in {"A", "B", "C", "D"}
    score = 1.0 if (is_valid and pred == gt) else 0.0

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=("Correct option" if score == 1.0 else "Incorrect option"),
        is_score_valid=is_valid,
        metrics={
            "exact_match": MetricResult(
                score=score,
                is_score_valid=is_valid,
                reason=("Matched" if score == 1.0 else "Not matched"),
                data={"pred": pred, "gt": gt},
            )
        },
    )
    return row
|
|
99
|
+
|
|
100
|
+
|
|
@@ -151,7 +151,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
|
|
|
151
151
|
tool_schemas: List[Dict],
|
|
152
152
|
env_index: int,
|
|
153
153
|
conversation_history: List[Dict[str, Any]],
|
|
154
|
-
) -> Tuple[List[MCPToolCall], CompletionUsage]:
|
|
154
|
+
) -> Tuple[List[MCPToolCall], CompletionUsage, str]:
|
|
155
155
|
"""
|
|
156
156
|
Generate tool calls using conversation history for proper OpenAI trajectories.
|
|
157
157
|
|
|
@@ -161,7 +161,7 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
|
|
|
161
161
|
user_prompt: Current user prompt with observation
|
|
162
162
|
|
|
163
163
|
Returns:
|
|
164
|
-
List of MCPToolCall objects
|
|
164
|
+
List of MCPToolCall objects, LLM usage stats, and finish reason
|
|
165
165
|
"""
|
|
166
166
|
# Convert MCP tools to LLM format
|
|
167
167
|
llm_tools = self._convert_mcp_tools_to_llm_format(tool_schemas)
|
|
@@ -190,6 +190,8 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
|
|
|
190
190
|
total_tokens=response["usage"]["total_tokens"],
|
|
191
191
|
)
|
|
192
192
|
|
|
193
|
+
finish_reason = response["choices"][0]["finish_reason"]
|
|
194
|
+
|
|
193
195
|
# Extract tool call from response
|
|
194
196
|
message = response["choices"][0]["message"]
|
|
195
197
|
logger.debug(f"Environment {env_index} - Response message: {message}")
|
|
@@ -217,15 +219,19 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
|
|
|
217
219
|
if self.max_tools_per_turn:
|
|
218
220
|
mcp_tool_calls = mcp_tool_calls[: self.max_tools_per_turn]
|
|
219
221
|
|
|
220
|
-
return mcp_tool_calls, usage_stats
|
|
222
|
+
return mcp_tool_calls, usage_stats, finish_reason
|
|
221
223
|
else:
|
|
222
224
|
# No tool calls in response - this is normal when episode ends or LLM provides only text
|
|
223
225
|
logger.debug(f"No tool calls in response for env {env_index}, message content: {message.get('content')}")
|
|
224
|
-
return
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
226
|
+
return (
|
|
227
|
+
[
|
|
228
|
+
MCPToolCall(
|
|
229
|
+
tool_name="_no_tool_call",
|
|
230
|
+
arguments={
|
|
231
|
+
"reason": "no_tool_call_generated",
|
|
232
|
+
},
|
|
233
|
+
)
|
|
234
|
+
],
|
|
235
|
+
usage_stats,
|
|
236
|
+
finish_reason,
|
|
237
|
+
)
|