eval-protocol 0.2.4__tar.gz → 0.2.5.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.4/eval_protocol.egg-info → eval_protocol-0.2.5.dev1}/PKG-INFO +2 -3
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/client/connection.py +19 -1
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/manager.py +3 -38
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/mcpgym.py +25 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/session/manager.py +7 -9
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_env.py +25 -9
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +17 -19
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/types/types.py +4 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1/eval_protocol.egg-info}/PKG-INFO +2 -3
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/requires.txt +1 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/pyproject.toml +1 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_parallel_rollouts.py +2 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rollout_control_plane_integration.py +10 -2
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_url_handling.py +26 -12
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/LICENSE +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/README.md +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/setup.cfg +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/setup.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_config.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_format.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_length.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_math.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_models.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/versioneer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.4
|
|
3
|
+
Version: 0.2.5.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
|
|
|
40
40
|
Requires-Dist: pandas>=1.5.0
|
|
41
41
|
Requires-Dist: watchdog>=2.1.0
|
|
42
42
|
Requires-Dist: websockets>=15.0.1
|
|
43
|
-
Requires-Dist: fireworks-ai>=0.19.12
|
|
44
43
|
Requires-Dist: fastapi>=0.116.1
|
|
45
44
|
Provides-Extra: dev
|
|
46
45
|
Requires-Dist: build; extra == "dev"
|
|
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
|
|
|
79
78
|
Provides-Extra: openevals
|
|
80
79
|
Requires-Dist: openevals>=0.1.0; extra == "openevals"
|
|
81
80
|
Provides-Extra: fireworks
|
|
82
|
-
Requires-Dist: fireworks-ai>=0.19.
|
|
81
|
+
Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
|
|
83
82
|
Provides-Extra: box2d
|
|
84
83
|
Requires-Dist: swig; extra == "box2d"
|
|
85
84
|
Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-06T17:51:29-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "a807140937b9002c71ee42a6afef594ea6377c2d",
|
|
15
|
+
"version": "0.2.5-dev1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -101,7 +101,7 @@ class MCPConnectionManager:
|
|
|
101
101
|
|
|
102
102
|
# Update the session ID to match what the server generated
|
|
103
103
|
session.session_id = server_session_id
|
|
104
|
-
logger.
|
|
104
|
+
logger.info(f"Updated session ID to match server: {server_session_id}")
|
|
105
105
|
|
|
106
106
|
# PRE-WARM: Discover and cache tools immediately after session initialization
|
|
107
107
|
# This prevents concurrent list_tools() calls later
|
|
@@ -133,6 +133,24 @@ class MCPConnectionManager:
|
|
|
133
133
|
self._tools_cache[cache_key] = tool_schemas
|
|
134
134
|
logger.debug(f"✅ PRE-WARMED {len(tool_schemas)} tools for{cache_key}")
|
|
135
135
|
|
|
136
|
+
async def reset_session(self, session: MCPSession) -> None:
|
|
137
|
+
"""
|
|
138
|
+
Clean session data in remote mcp server for the given session
|
|
139
|
+
"""
|
|
140
|
+
import httpx
|
|
141
|
+
|
|
142
|
+
base_url = session.base_url.rstrip("/").removesuffix("/mcp")
|
|
143
|
+
url = f"{base_url}/control/reset_session"
|
|
144
|
+
|
|
145
|
+
headers = {"mcp-session-id": session.session_id}
|
|
146
|
+
body = {"seed": session.seed}
|
|
147
|
+
|
|
148
|
+
timeout = httpx.Timeout(3.0)
|
|
149
|
+
async with httpx.AsyncClient(timeout=timeout) as client:
|
|
150
|
+
resp = await client.post(url, headers=headers, json=body)
|
|
151
|
+
resp.raise_for_status()
|
|
152
|
+
logger.debug(f"Session {session.session_id}: reset_session -> {resp.json()}")
|
|
153
|
+
|
|
136
154
|
async def discover_tools(self, session: MCPSession) -> List[Dict]:
|
|
137
155
|
"""
|
|
138
156
|
Discover available tools from an MCP session.
|
|
@@ -22,7 +22,6 @@ from vendor.tau2.user.user_simulator import UserSimulator
|
|
|
22
22
|
|
|
23
23
|
from ...models import CompletionParams, EvaluationRow, InputMetadata, Message
|
|
24
24
|
from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory
|
|
25
|
-
from ..client.connection import MCPConnectionManager
|
|
26
25
|
|
|
27
26
|
if TYPE_CHECKING:
|
|
28
27
|
from ..session.manager import GeneralMCPVectorEnv
|
|
@@ -33,43 +32,9 @@ logger = logging.getLogger(__name__)
|
|
|
33
32
|
|
|
34
33
|
class ExecutionManager:
|
|
35
34
|
"""
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
Combines the functionality of SessionManager and RolloutManager for better
|
|
39
|
-
organization and reduced complexity.
|
|
35
|
+
Manage rollout for MCP environments.
|
|
40
36
|
"""
|
|
41
37
|
|
|
42
|
-
def __init__(self):
|
|
43
|
-
"""Initialize the execution manager."""
|
|
44
|
-
self.connection_manager = MCPConnectionManager()
|
|
45
|
-
|
|
46
|
-
async def initialize_sessions(self, sessions: List[MCPSession]) -> None:
|
|
47
|
-
"""
|
|
48
|
-
Initialize multiple MCP sessions in parallel.
|
|
49
|
-
|
|
50
|
-
Args:
|
|
51
|
-
sessions: List of MCPSessions to initialize
|
|
52
|
-
"""
|
|
53
|
-
tasks = [self.connection_manager.initialize_session(session) for session in sessions]
|
|
54
|
-
await asyncio.gather(*tasks)
|
|
55
|
-
|
|
56
|
-
async def close_sessions(self, sessions: List[MCPSession]) -> None:
|
|
57
|
-
"""
|
|
58
|
-
Close multiple MCP sessions in parallel.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
sessions: List of MCPSessions to close
|
|
62
|
-
"""
|
|
63
|
-
tasks = [asyncio.create_task(self.connection_manager.close_session(session)) for session in sessions]
|
|
64
|
-
|
|
65
|
-
if tasks:
|
|
66
|
-
try:
|
|
67
|
-
# Wait for all close operations to complete
|
|
68
|
-
await asyncio.gather(*tasks, return_exceptions=True)
|
|
69
|
-
except asyncio.CancelledError:
|
|
70
|
-
# Handle cancellation gracefully (especially important for Python 3.12)
|
|
71
|
-
logger.debug("Close operation was cancelled, but sessions are marked as closed")
|
|
72
|
-
|
|
73
38
|
async def execute_rollouts(
|
|
74
39
|
self,
|
|
75
40
|
envs: "GeneralMCPVectorEnv",
|
|
@@ -178,7 +143,7 @@ class ExecutionManager:
|
|
|
178
143
|
for msg in trajectory.conversation_history:
|
|
179
144
|
# Create a copy to avoid modifying the original
|
|
180
145
|
msg_dict = dict(msg)
|
|
181
|
-
|
|
146
|
+
|
|
182
147
|
# Handle multimodal content (list of content blocks) by extracting text
|
|
183
148
|
if isinstance(msg_dict.get("content"), list):
|
|
184
149
|
text_content = None
|
|
@@ -187,7 +152,7 @@ class ExecutionManager:
|
|
|
187
152
|
text_content = content_block.get("text")
|
|
188
153
|
break
|
|
189
154
|
msg_dict["content"] = text_content or ""
|
|
190
|
-
|
|
155
|
+
|
|
191
156
|
messages.append(Message.model_validate(msg_dict))
|
|
192
157
|
|
|
193
158
|
input_metadata = InputMetadata(
|
|
@@ -116,6 +116,7 @@ class McpGym(ABC):
|
|
|
116
116
|
# Register tools and control plane endpoints
|
|
117
117
|
self._register_tools()
|
|
118
118
|
self._discover_and_register_control_plane_endpoints()
|
|
119
|
+
self._register_session_reset_endpoint()
|
|
119
120
|
|
|
120
121
|
def _get_session_id(self, ctx: Context) -> str:
|
|
121
122
|
"""
|
|
@@ -227,6 +228,28 @@ class McpGym(ABC):
|
|
|
227
228
|
|
|
228
229
|
return self.sessions[session_id]
|
|
229
230
|
|
|
231
|
+
def _register_session_reset_endpoint(self):
|
|
232
|
+
|
|
233
|
+
@self.mcp.custom_route("/control/reset_session", methods=["POST"])
|
|
234
|
+
async def reset_session_endpoint(request: Request) -> JSONResponse:
|
|
235
|
+
session_id = request.headers.get("mcp-session-id")
|
|
236
|
+
body = await request.json()
|
|
237
|
+
seed = body.get("seed", None)
|
|
238
|
+
print(f"🔍 _register_session_reset_endpoint: Resetting session, session_id: {session_id}, seed: {seed}")
|
|
239
|
+
if not session_id:
|
|
240
|
+
return JSONResponse({"error": "Missing mcp-session-id header"}, status_code=400)
|
|
241
|
+
with self.session_lock:
|
|
242
|
+
if session_id in self.sessions:
|
|
243
|
+
env, obs, _ = self._new_env(seed=seed)
|
|
244
|
+
self.sessions[session_id] = {
|
|
245
|
+
"env": env,
|
|
246
|
+
"obs": obs,
|
|
247
|
+
"session_data": {},
|
|
248
|
+
"session_id": session_id,
|
|
249
|
+
}
|
|
250
|
+
print(f"🔍 _register_session_reset_endpoint: Finished reset session, session_id: {session_id}")
|
|
251
|
+
return JSONResponse({"message": "Session reset successfully"})
|
|
252
|
+
|
|
230
253
|
def _discover_and_register_control_plane_endpoints(self):
|
|
231
254
|
"""
|
|
232
255
|
Discover and register control plane endpoints on the subclass instance.
|
|
@@ -323,7 +346,7 @@ class McpGym(ABC):
|
|
|
323
346
|
|
|
324
347
|
# Log control plane update (for debugging)
|
|
325
348
|
print(
|
|
326
|
-
f"🎛️ Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}"
|
|
349
|
+
f"🎛️ Control plane updated: reward={reward}, terminated={terminated}, step={self.control_plane_state['step_count']}, total_reward={self.control_plane_state['total_reward']}"
|
|
327
350
|
)
|
|
328
351
|
|
|
329
352
|
def _get_or_create_session_control_plane(self, session_id: str) -> Dict[str, Any]:
|
|
@@ -365,7 +388,7 @@ class McpGym(ABC):
|
|
|
365
388
|
|
|
366
389
|
# Log control plane update
|
|
367
390
|
print(
|
|
368
|
-
f"🎛️ Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}"
|
|
391
|
+
f"🎛️ Session {session_id[:16]}... control plane: reward={reward}, terminated={terminated}, step={control_plane['step_count']}, total_reward={control_plane['total_reward']}"
|
|
369
392
|
)
|
|
370
393
|
|
|
371
394
|
def get_control_plane_state(self, session_id: str) -> Optional[Dict[str, Any]]:
|
|
@@ -11,7 +11,7 @@ import logging
|
|
|
11
11
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
12
12
|
|
|
13
13
|
from ...types import DatasetRow, MCPSession, MCPToolCall
|
|
14
|
-
from ..
|
|
14
|
+
from ..client.connection import MCPConnectionManager
|
|
15
15
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
@@ -44,7 +44,7 @@ class GeneralMCPVectorEnv:
|
|
|
44
44
|
self.user_prompt_formatter = user_prompt_formatter or self._default_formatter
|
|
45
45
|
self.n = len(sessions)
|
|
46
46
|
self.tool_schemas = [] # Discovered from MCP servers
|
|
47
|
-
self.
|
|
47
|
+
self.connection_manager = MCPConnectionManager()
|
|
48
48
|
self.usage_stats = {} # llm usage stats for monitoring
|
|
49
49
|
|
|
50
50
|
if len(sessions) != len(dataset_rows):
|
|
@@ -58,17 +58,14 @@ class GeneralMCPVectorEnv:
|
|
|
58
58
|
|
|
59
59
|
This is thread-safe and can be called from worker threads.
|
|
60
60
|
"""
|
|
61
|
-
# Establish a persistent session for each environment.
|
|
62
|
-
await self.execution_manager.connection_manager.initialize_session(session)
|
|
63
|
-
|
|
64
61
|
# Get available tools from MCP server
|
|
65
|
-
tool_schemas = await self.
|
|
62
|
+
tool_schemas = await self.connection_manager.discover_tools(session)
|
|
66
63
|
|
|
67
64
|
if not self.tool_schemas:
|
|
68
65
|
self.tool_schemas = tool_schemas
|
|
69
66
|
|
|
70
67
|
# PROPER MCP PATTERN: Get initial state from resources during session establishment
|
|
71
|
-
initial_observation = await self.
|
|
68
|
+
initial_observation = await self.connection_manager.get_initial_state(session)
|
|
72
69
|
|
|
73
70
|
# Update session state
|
|
74
71
|
session.terminated = False
|
|
@@ -119,7 +116,7 @@ class GeneralMCPVectorEnv:
|
|
|
119
116
|
)
|
|
120
117
|
|
|
121
118
|
# Execute the tool call via MCP protocol
|
|
122
|
-
observation, reward, done, info = await self.
|
|
119
|
+
observation, reward, done, info = await self.connection_manager.call_tool(
|
|
123
120
|
session, tool_call.tool_name, tool_call.arguments
|
|
124
121
|
)
|
|
125
122
|
|
|
@@ -223,5 +220,6 @@ class GeneralMCPVectorEnv:
|
|
|
223
220
|
async def close(self):
|
|
224
221
|
"""Closes all MCP sessions."""
|
|
225
222
|
print(f"🧹 Closing {self.n} MCP sessions...")
|
|
226
|
-
|
|
223
|
+
tasks = [self.connection_manager.close_session(session) for session in self.sessions]
|
|
224
|
+
await asyncio.gather(*tasks)
|
|
227
225
|
print(f"✅ All MCP sessions closed.")
|
|
@@ -17,7 +17,7 @@ Usage remains the same:
|
|
|
17
17
|
policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
|
|
18
18
|
|
|
19
19
|
# Create environments with evaluation_rows configuration
|
|
20
|
-
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
|
|
20
|
+
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
|
|
21
21
|
|
|
22
22
|
# Execute tool-calling rollouts
|
|
23
23
|
evaluation_rows = await ep.rollout(envs, policy=policy, steps=512)
|
|
@@ -51,11 +51,20 @@ from .mcp.execution.policy import AnthropicPolicy, FireworksPolicy, LLMBasePolic
|
|
|
51
51
|
from .mcp.session.manager import GeneralMCPVectorEnv
|
|
52
52
|
from .models import EvaluationRow
|
|
53
53
|
from .types import DatasetRow, MCPSession, MCPToolCall
|
|
54
|
+
import asyncio
|
|
54
55
|
|
|
55
56
|
logger = logging.getLogger(__name__)
|
|
56
57
|
|
|
57
58
|
|
|
58
|
-
def
|
|
59
|
+
async def reset_mcp_sessions(envs: GeneralMCPVectorEnv):
|
|
60
|
+
"""
|
|
61
|
+
Reset mcp server sessions
|
|
62
|
+
"""
|
|
63
|
+
tasks = [envs.connection_manager.reset_session(session) for session in envs.sessions]
|
|
64
|
+
await asyncio.gather(*tasks)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
async def make(
|
|
59
68
|
env_spec: str,
|
|
60
69
|
evaluation_rows: Optional[List[EvaluationRow]] = None,
|
|
61
70
|
dataset: Optional[List[Dict]] = None,
|
|
@@ -63,6 +72,7 @@ def make(
|
|
|
63
72
|
seeds: Optional[List[int]] = None,
|
|
64
73
|
model_id: str = "unknown",
|
|
65
74
|
user_prompt_formatter: Optional[Callable] = None,
|
|
75
|
+
reset_sessions: bool = False,
|
|
66
76
|
) -> GeneralMCPVectorEnv:
|
|
67
77
|
"""
|
|
68
78
|
Create general MCP environments driven by evaluation_rows configuration.
|
|
@@ -75,19 +85,20 @@ def make(
|
|
|
75
85
|
seeds: List of seeds (for backward compatibility)
|
|
76
86
|
model_id: Model identifier
|
|
77
87
|
user_prompt_formatter: Optional callback for formatting user prompts
|
|
88
|
+
reset_sessions: Whether to reset sessions before returning the environment
|
|
78
89
|
|
|
79
90
|
Returns:
|
|
80
91
|
General MCP environment that works with any MCP server
|
|
81
92
|
|
|
82
93
|
Example:
|
|
83
94
|
# EvaluationRow approach (preferred)
|
|
84
|
-
envs = ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
|
|
95
|
+
envs = await ep.make("http://localhost:8000/mcp", evaluation_rows=evaluation_rows)
|
|
85
96
|
|
|
86
97
|
# Dataset approach (backward compatibility)
|
|
87
|
-
envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
|
|
98
|
+
envs = await ep.make("http://localhost:8000/mcp", dataset=dataset)
|
|
88
99
|
|
|
89
100
|
# Legacy approach (backward compatibility)
|
|
90
|
-
envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
|
|
101
|
+
envs = await ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
|
|
91
102
|
"""
|
|
92
103
|
# Parse environment specification - make sure URL format is correct
|
|
93
104
|
base_url = env_spec
|
|
@@ -160,8 +171,6 @@ def make(
|
|
|
160
171
|
)
|
|
161
172
|
sessions.append(session)
|
|
162
173
|
|
|
163
|
-
return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
|
|
164
|
-
|
|
165
174
|
else:
|
|
166
175
|
# Legacy approach for backward compatibility
|
|
167
176
|
if n is None:
|
|
@@ -198,7 +207,14 @@ def make(
|
|
|
198
207
|
)
|
|
199
208
|
sessions.append(session)
|
|
200
209
|
|
|
201
|
-
|
|
210
|
+
mcp_envs = GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
|
|
211
|
+
tasks = [mcp_envs.connection_manager.initialize_session(session) for session in sessions]
|
|
212
|
+
await asyncio.gather(*tasks)
|
|
213
|
+
|
|
214
|
+
if reset_sessions:
|
|
215
|
+
await reset_mcp_sessions(mcp_envs)
|
|
216
|
+
|
|
217
|
+
return mcp_envs
|
|
202
218
|
|
|
203
219
|
|
|
204
220
|
async def rollout(
|
|
@@ -266,7 +282,7 @@ async def rollout(
|
|
|
266
282
|
raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL")
|
|
267
283
|
|
|
268
284
|
auto_model_id = model_id or getattr(policy, "model_id", "unknown")
|
|
269
|
-
envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
|
|
285
|
+
envs = await make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id)
|
|
270
286
|
|
|
271
287
|
# Use the new ExecutionManager for execution
|
|
272
288
|
execution_manager = ExecutionManager()
|
|
@@ -182,49 +182,47 @@ class MCPServerManager:
|
|
|
182
182
|
return False # Don't suppress exceptions
|
|
183
183
|
|
|
184
184
|
|
|
185
|
-
|
|
186
|
-
|
|
185
|
+
async def default_mcp_gym_rollout_processor(
|
|
186
|
+
rows: List[EvaluationRow], config: RolloutProcessorConfig
|
|
187
|
+
) -> List[EvaluationRow]:
|
|
187
188
|
"""
|
|
188
189
|
Rollout processor for tau bench environments.
|
|
189
|
-
|
|
190
|
+
|
|
190
191
|
This processor starts an MCP server, creates tau bench environments, and runs rollouts
|
|
191
192
|
using the eval_protocol framework, following the pattern from test_tau2_e2e.py.
|
|
192
|
-
|
|
193
|
+
|
|
193
194
|
Args:
|
|
194
195
|
rows: List of EvaluationRow objects containing messages and dataset info in input_metadata
|
|
195
196
|
config: RolloutProcessorConfig with model and other parameters
|
|
196
|
-
|
|
197
|
+
|
|
197
198
|
Returns:
|
|
198
199
|
List of EvaluationRow objects with completed conversations
|
|
199
200
|
"""
|
|
200
201
|
server = MCPServerManager(config.server_script_path, port=9700)
|
|
201
|
-
|
|
202
|
+
|
|
202
203
|
try:
|
|
203
204
|
server.start()
|
|
204
|
-
|
|
205
|
+
|
|
205
206
|
policy = ep.LiteLLMPolicy(
|
|
206
207
|
model_id=config.model,
|
|
207
|
-
temperature=config.input_params.get(
|
|
208
|
-
max_tokens=config.input_params.get(
|
|
208
|
+
temperature=config.input_params.get("temperature", 0.0),
|
|
209
|
+
max_tokens=config.input_params.get("max_tokens", 4096),
|
|
209
210
|
)
|
|
210
|
-
|
|
211
|
+
|
|
211
212
|
# Create MCP environments directly from evaluation_rows
|
|
212
|
-
envs = ep.make(
|
|
213
|
-
|
|
213
|
+
envs = await ep.make(
|
|
214
|
+
"http://localhost:9700/mcp/",
|
|
214
215
|
evaluation_rows=rows,
|
|
215
216
|
model_id=policy.model_id,
|
|
216
217
|
)
|
|
217
|
-
|
|
218
|
+
|
|
218
219
|
# Run rollout with environments and policy
|
|
219
220
|
evaluation_rows = await ep.rollout(
|
|
220
|
-
envs,
|
|
221
|
-
policy=policy,
|
|
222
|
-
steps=config.steps,
|
|
223
|
-
max_concurrent_rollouts=config.max_concurrent_rollouts
|
|
221
|
+
envs, policy=policy, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts
|
|
224
222
|
)
|
|
225
|
-
|
|
223
|
+
|
|
226
224
|
return evaluation_rows
|
|
227
|
-
|
|
225
|
+
|
|
228
226
|
finally:
|
|
229
227
|
# Always clean up the server
|
|
230
228
|
server.stop()
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
2
|
from enum import Enum
|
|
3
3
|
from typing import Any, Dict, List, Optional
|
|
4
|
+
from mcp.client.session import ClientSession
|
|
5
|
+
from contextlib import AsyncExitStack
|
|
4
6
|
|
|
5
7
|
|
|
6
8
|
class TerminationReason(str, Enum):
|
|
@@ -50,8 +52,8 @@ class MCPSession:
|
|
|
50
52
|
last_observation: Any = None
|
|
51
53
|
|
|
52
54
|
# Persistent MCP connection components
|
|
53
|
-
_exit_stack: Optional[
|
|
54
|
-
_mcp_session: Optional[
|
|
55
|
+
_exit_stack: Optional[AsyncExitStack] = None
|
|
56
|
+
_mcp_session: Optional[ClientSession] = None
|
|
55
57
|
|
|
56
58
|
|
|
57
59
|
@dataclass
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.5.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,7 +40,6 @@ Requires-Dist: deepdiff>=6.0.0
|
|
|
40
40
|
Requires-Dist: pandas>=1.5.0
|
|
41
41
|
Requires-Dist: watchdog>=2.1.0
|
|
42
42
|
Requires-Dist: websockets>=15.0.1
|
|
43
|
-
Requires-Dist: fireworks-ai>=0.19.12
|
|
44
43
|
Requires-Dist: fastapi>=0.116.1
|
|
45
44
|
Provides-Extra: dev
|
|
46
45
|
Requires-Dist: build; extra == "dev"
|
|
@@ -79,7 +78,7 @@ Requires-Dist: accelerate>=0.28.0; extra == "trl"
|
|
|
79
78
|
Provides-Extra: openevals
|
|
80
79
|
Requires-Dist: openevals>=0.1.0; extra == "openevals"
|
|
81
80
|
Provides-Extra: fireworks
|
|
82
|
-
Requires-Dist: fireworks-ai>=0.19.
|
|
81
|
+
Requires-Dist: fireworks-ai>=0.19.12; extra == "fireworks"
|
|
83
82
|
Provides-Extra: box2d
|
|
84
83
|
Requires-Dist: swig; extra == "box2d"
|
|
85
84
|
Requires-Dist: gymnasium[box2d]>=0.29.0; extra == "box2d"
|
|
@@ -28,7 +28,6 @@ deepdiff>=6.0.0
|
|
|
28
28
|
pandas>=1.5.0
|
|
29
29
|
watchdog>=2.1.0
|
|
30
30
|
websockets>=15.0.1
|
|
31
|
-
fireworks-ai>=0.19.12
|
|
32
31
|
fastapi>=0.116.1
|
|
33
32
|
|
|
34
33
|
[adapters]
|
|
@@ -71,7 +70,7 @@ pip>=25.1.1
|
|
|
71
70
|
haikus==0.3.8
|
|
72
71
|
|
|
73
72
|
[fireworks]
|
|
74
|
-
fireworks-ai>=0.19.
|
|
73
|
+
fireworks-ai>=0.19.12
|
|
75
74
|
|
|
76
75
|
[huggingface]
|
|
77
76
|
datasets>=2.0.0
|
|
@@ -48,7 +48,6 @@ dependencies = [
|
|
|
48
48
|
"pandas>=1.5.0",
|
|
49
49
|
"watchdog>=2.1.0",
|
|
50
50
|
"websockets>=15.0.1",
|
|
51
|
-
"fireworks-ai>=0.19.12",
|
|
52
51
|
"fastapi>=0.116.1",
|
|
53
52
|
]
|
|
54
53
|
|
|
@@ -96,7 +95,7 @@ openevals = [
|
|
|
96
95
|
"openevals>=0.1.0",
|
|
97
96
|
]
|
|
98
97
|
fireworks = [
|
|
99
|
-
"fireworks-ai>=0.19.
|
|
98
|
+
"fireworks-ai>=0.19.12",
|
|
100
99
|
]
|
|
101
100
|
box2d = [
|
|
102
101
|
"swig",
|
|
@@ -138,7 +138,7 @@ async def _test_seed_handling_and_type_compatibility_impl():
|
|
|
138
138
|
)
|
|
139
139
|
|
|
140
140
|
# 3. Test that environments are created with proper seed isolation
|
|
141
|
-
envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
|
|
141
|
+
envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
|
|
142
142
|
|
|
143
143
|
# Verify we have the right number of environments
|
|
144
144
|
assert len(envs.sessions) == len(test_seeds), f"Expected {len(test_seeds)} sessions, got {len(envs.sessions)}"
|
|
@@ -273,7 +273,7 @@ async def _run_simplified_compatibility_test():
|
|
|
273
273
|
)
|
|
274
274
|
|
|
275
275
|
# This should work even without a server (just creates session objects)
|
|
276
|
-
envs = ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
|
|
276
|
+
envs = await ep.make("http://127.0.0.1:8001/mcp/", dataset=dataset)
|
|
277
277
|
assert len(envs.sessions) == len(test_seeds)
|
|
278
278
|
print("✅ Environment creation works")
|
|
279
279
|
|
{eval_protocol-0.2.4 → eval_protocol-0.2.5.dev1}/tests/test_rollout_control_plane_integration.py
RENAMED
|
@@ -489,7 +489,7 @@ class TestRolloutControlPlaneIntegration:
|
|
|
489
489
|
policy = MockPolicy(["right"])
|
|
490
490
|
|
|
491
491
|
with (
|
|
492
|
-
patch("eval_protocol.mcp_env.make") as mock_make,
|
|
492
|
+
patch("eval_protocol.mcp_env.make", new_callable=AsyncMock) as mock_make,
|
|
493
493
|
patch("eval_protocol.mcp_env.ExecutionManager") as MockManager,
|
|
494
494
|
):
|
|
495
495
|
mock_env = MagicMock()
|
|
@@ -512,7 +512,15 @@ class TestRolloutControlPlaneIntegration:
|
|
|
512
512
|
dataset=dataset,
|
|
513
513
|
model_id="test_model",
|
|
514
514
|
)
|
|
515
|
-
|
|
515
|
+
|
|
516
|
+
manager_instance.execute_rollouts.assert_called_once_with(
|
|
517
|
+
mock_make.return_value,
|
|
518
|
+
policy,
|
|
519
|
+
5,
|
|
520
|
+
None,
|
|
521
|
+
8,
|
|
522
|
+
)
|
|
523
|
+
|
|
516
524
|
assert result == ["ok"]
|
|
517
525
|
|
|
518
526
|
def test_control_plane_trajectory_serialization(self):
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
1
|
+
from unittest.mock import AsyncMock, patch
|
|
3
2
|
import httpx
|
|
4
3
|
import pytest
|
|
5
4
|
from werkzeug.wrappers import Response
|
|
@@ -7,31 +6,46 @@ from werkzeug.wrappers import Response
|
|
|
7
6
|
import eval_protocol as ep
|
|
8
7
|
|
|
9
8
|
|
|
10
|
-
# Sync tests for the ep.make() function
|
|
11
|
-
|
|
9
|
+
# Sync tests for the await ep.make() function
|
|
10
|
+
@pytest.mark.asyncio
|
|
11
|
+
async def test_mcp_env_make_appends_trailing_slash():
|
|
12
12
|
"""
|
|
13
|
-
Verify that ep.make() appends a trailing slash to the MCP server URL if it's missing.
|
|
13
|
+
Verify that await ep.make() appends a trailing slash to the MCP server URL if it's missing.
|
|
14
14
|
This prevents 307 redirects that can break HTTP clients.
|
|
15
15
|
"""
|
|
16
16
|
base_url = "http://localhost:8000/mcp"
|
|
17
17
|
corrected_url = "http://localhost:8000/mcp/"
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
with patch(
|
|
20
|
+
"eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
|
|
21
|
+
new_callable=AsyncMock,
|
|
22
|
+
) as mock_init:
|
|
23
|
+
mock_init.return_value = None
|
|
24
|
+
|
|
25
|
+
envs = await ep.make(base_url, n=1, seeds=[42])
|
|
26
|
+
|
|
27
|
+
mock_init.assert_awaited_once()
|
|
21
28
|
|
|
22
29
|
assert len(envs.sessions) == 1
|
|
23
|
-
# The session's base_url should have the trailing slash
|
|
24
30
|
assert envs.sessions[0].base_url == corrected_url
|
|
25
31
|
|
|
26
32
|
|
|
27
|
-
|
|
33
|
+
@pytest.mark.asyncio
|
|
34
|
+
async def test_mcp_env_make_keeps_existing_trailing_slash():
|
|
28
35
|
"""
|
|
29
|
-
Verify that ep.make() does not add an extra slash if one is already present.
|
|
36
|
+
Verify that await ep.make() does not add an extra slash if one is already present.
|
|
30
37
|
"""
|
|
31
38
|
base_url = "http://localhost:8000/mcp/"
|
|
32
39
|
|
|
33
|
-
|
|
34
|
-
|
|
40
|
+
with patch(
|
|
41
|
+
"eval_protocol.mcp.client.connection.MCPConnectionManager.initialize_session",
|
|
42
|
+
new_callable=AsyncMock,
|
|
43
|
+
) as mock_init:
|
|
44
|
+
mock_init.return_value = None
|
|
45
|
+
|
|
46
|
+
envs = await ep.make(base_url, n=1, seeds=[42])
|
|
47
|
+
|
|
48
|
+
mock_init.assert_awaited_once()
|
|
35
49
|
|
|
36
50
|
assert len(envs.sessions) == 1
|
|
37
51
|
# The session's base_url should remain unchanged
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|