eval-protocol 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.1/eval_protocol.egg-info → eval_protocol-0.2.3}/PKG-INFO +48 -3
- eval_protocol-0.2.3/README.md +69 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_agent_rollout_processor.py +34 -8
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +37 -7
- {eval_protocol-0.2.1 → eval_protocol-0.2.3/eval_protocol.egg-info}/PKG-INFO +48 -3
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/pyproject.toml +1 -0
- eval_protocol-0.2.1/README.md +0 -24
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/LICENSE +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/setup.cfg +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/setup.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_config.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_format.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_length.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_math.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_models.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.1 → eval_protocol-0.2.3}/versioneer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -96,8 +96,53 @@ Dynamic: license-file
|
|
|
96
96
|
|
|
97
97
|
[PyPI](https://pypi.org/project/eval-protocol/)
|
|
98
98
|
|
|
99
|
-
EP is an open protocol that standardizes how developers author evals for large
|
|
100
|
-
language model (LLM) applications.
|
|
99
|
+
EP is an open specification, Python SDK, and pytest wrapper that provides a
|
|
100
|
+
standardized way to write evaluations for large language model (LLM)
|
|
101
|
+
applications. Start with simple single-turn evals for model selection and prompt
|
|
102
|
+
engineering, then scale up to complex multi-turn reinforcement learning (RL) for
|
|
103
|
+
agents using Model Context Protocol (MCP). EP ensures consistent patterns for
|
|
104
|
+
writing evals, storing traces, and saving results—enabling you to build
|
|
105
|
+
sophisticated agent evaluations that work across real-world scenarios, from
|
|
106
|
+
markdown generation tasks to customer service agents with tool calling
|
|
107
|
+
capabilities.
|
|
108
|
+
|
|
109
|
+
## Quick Example
|
|
110
|
+
|
|
111
|
+
Here's a simple test function that checks if a model's response contains **bold** text formatting:
|
|
112
|
+
|
|
113
|
+
```python test_bold_format.py
|
|
114
|
+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
|
|
115
|
+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
|
|
116
|
+
|
|
117
|
+
@evaluation_test(
|
|
118
|
+
input_messages=[
|
|
119
|
+
[
|
|
120
|
+
Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
|
|
121
|
+
Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
|
|
122
|
+
],
|
|
123
|
+
],
|
|
124
|
+
model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
|
|
125
|
+
rollout_processor=default_single_turn_rollout_processor,
|
|
126
|
+
mode="pointwise",
|
|
127
|
+
)
|
|
128
|
+
def test_bold_format(row: EvaluationRow) -> EvaluationRow:
|
|
129
|
+
"""
|
|
130
|
+
Simple evaluation that checks if the model's response contains bold text.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
assistant_response = row.messages[-1].content
|
|
134
|
+
|
|
135
|
+
# Check if response contains **bold** text
|
|
136
|
+
has_bold = "**" in assistant_response
|
|
137
|
+
|
|
138
|
+
if has_bold:
|
|
139
|
+
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
|
|
140
|
+
else:
|
|
141
|
+
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
|
|
142
|
+
|
|
143
|
+
row.evaluation_result = result
|
|
144
|
+
return row
|
|
145
|
+
```
|
|
101
146
|
|
|
102
147
|
## Documentation
|
|
103
148
|
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# Eval Protocol (EP)
|
|
2
|
+
|
|
3
|
+
[PyPI](https://pypi.org/project/eval-protocol/)
|
|
4
|
+
|
|
5
|
+
EP is an open specification, Python SDK, and pytest wrapper that provides a
|
|
6
|
+
standardized way to write evaluations for large language model (LLM)
|
|
7
|
+
applications. Start with simple single-turn evals for model selection and prompt
|
|
8
|
+
engineering, then scale up to complex multi-turn reinforcement learning (RL) for
|
|
9
|
+
agents using Model Context Protocol (MCP). EP ensures consistent patterns for
|
|
10
|
+
writing evals, storing traces, and saving results—enabling you to build
|
|
11
|
+
sophisticated agent evaluations that work across real-world scenarios, from
|
|
12
|
+
markdown generation tasks to customer service agents with tool calling
|
|
13
|
+
capabilities.
|
|
14
|
+
|
|
15
|
+
## Quick Example
|
|
16
|
+
|
|
17
|
+
Here's a simple test function that checks if a model's response contains **bold** text formatting:
|
|
18
|
+
|
|
19
|
+
```python test_bold_format.py
|
|
20
|
+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
|
|
21
|
+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
|
|
22
|
+
|
|
23
|
+
@evaluation_test(
|
|
24
|
+
input_messages=[
|
|
25
|
+
[
|
|
26
|
+
Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
|
|
27
|
+
Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
|
|
28
|
+
],
|
|
29
|
+
],
|
|
30
|
+
model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
|
|
31
|
+
rollout_processor=default_single_turn_rollout_processor,
|
|
32
|
+
mode="pointwise",
|
|
33
|
+
)
|
|
34
|
+
def test_bold_format(row: EvaluationRow) -> EvaluationRow:
|
|
35
|
+
"""
|
|
36
|
+
Simple evaluation that checks if the model's response contains bold text.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
assistant_response = row.messages[-1].content
|
|
40
|
+
|
|
41
|
+
# Check if response contains **bold** text
|
|
42
|
+
has_bold = "**" in assistant_response
|
|
43
|
+
|
|
44
|
+
if has_bold:
|
|
45
|
+
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
|
|
46
|
+
else:
|
|
47
|
+
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
|
|
48
|
+
|
|
49
|
+
row.evaluation_result = result
|
|
50
|
+
return row
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Documentation
|
|
54
|
+
|
|
55
|
+
See our [documentation](https://evalprotocol.io) for more details.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
|
|
59
|
+
**This library requires Python >= 3.10.**
|
|
60
|
+
|
|
61
|
+
Install with pip:
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
pip install eval-protocol
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## License
|
|
68
|
+
|
|
69
|
+
[MIT](LICENSE)
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-04T20:35:33-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "52b46a7d3f8455d848d8d5138ec4ca4d6343d3d2",
|
|
15
|
+
"version": "0.2.3"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/pytest/default_agent_rollout_processor.py
RENAMED
|
@@ -1,8 +1,10 @@
|
|
|
1
|
+
import asyncio
|
|
1
2
|
import json
|
|
2
3
|
import os
|
|
3
|
-
from typing import Any, List, Optional
|
|
4
|
+
from typing import Any, List, Optional, Union
|
|
4
5
|
|
|
5
6
|
from mcp.types import CallToolResult
|
|
7
|
+
from openai import NOT_GIVEN, NotGiven
|
|
6
8
|
from openai.types.chat import ChatCompletionMessage, ChatCompletionToolParam
|
|
7
9
|
from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
|
|
8
10
|
|
|
@@ -22,27 +24,43 @@ class Agent:
|
|
|
22
24
|
self.messages: list[Message] = initial_messages
|
|
23
25
|
self._policy = LiteLLMPolicy(model_id=model)
|
|
24
26
|
self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
|
|
27
|
+
self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
|
|
25
28
|
|
|
26
29
|
async def setup(self):
|
|
27
30
|
if self.mcp_client:
|
|
28
31
|
await self.mcp_client.connect_to_servers()
|
|
29
32
|
|
|
33
|
+
async def _get_tools(self) -> Optional[List[ChatCompletionToolParam]]:
|
|
34
|
+
if self.tools is NOT_GIVEN:
|
|
35
|
+
self.tools = await self.mcp_client.get_available_tools() if self.mcp_client else None
|
|
36
|
+
return self.tools
|
|
37
|
+
|
|
30
38
|
async def call_agent(self) -> str:
|
|
31
39
|
"""
|
|
32
40
|
Call the assistant with the user query.
|
|
33
41
|
"""
|
|
34
|
-
tools = await self.
|
|
42
|
+
tools = await self._get_tools() if self.mcp_client else None
|
|
35
43
|
|
|
36
44
|
message = await self._call_model(self.messages, tools)
|
|
37
45
|
self.messages.append(message)
|
|
38
46
|
if message["tool_calls"]:
|
|
47
|
+
# Create tasks for all tool calls to run them in parallel
|
|
48
|
+
tool_tasks = []
|
|
39
49
|
for tool_call in message["tool_calls"]:
|
|
40
50
|
tool_call_id = tool_call["id"]
|
|
41
51
|
tool_name = tool_call["function"]["name"]
|
|
42
52
|
tool_args = tool_call["function"]["arguments"]
|
|
43
53
|
tool_args_dict = json.loads(tool_args)
|
|
44
|
-
|
|
45
|
-
|
|
54
|
+
|
|
55
|
+
# Create a task for each tool call
|
|
56
|
+
task = self._execute_tool_call(tool_call_id, tool_name, tool_args_dict)
|
|
57
|
+
tool_tasks.append(task)
|
|
58
|
+
|
|
59
|
+
# Execute all tool calls in parallel
|
|
60
|
+
tool_results = await asyncio.gather(*tool_tasks)
|
|
61
|
+
|
|
62
|
+
# Add all tool results to messages (they will be in the same order as tool_calls)
|
|
63
|
+
for tool_call, (tool_call_id, content) in zip(message["tool_calls"], tool_results):
|
|
46
64
|
self.messages.append(
|
|
47
65
|
{
|
|
48
66
|
"role": "tool",
|
|
@@ -50,18 +68,26 @@ class Agent:
|
|
|
50
68
|
"tool_call_id": tool_call_id,
|
|
51
69
|
}
|
|
52
70
|
)
|
|
71
|
+
return await self.call_agent()
|
|
53
72
|
return message["content"]
|
|
54
73
|
|
|
55
74
|
async def _call_model(
|
|
56
75
|
self, messages: list[Message], tools: Optional[list[ChatCompletionToolParam]]
|
|
57
76
|
) -> ChatCompletionMessage:
|
|
58
77
|
messages = [message.model_dump() if hasattr(message, "model_dump") else message for message in messages]
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
tools=tools,
|
|
62
|
-
)
|
|
78
|
+
tools = [{"function": tool["function"].model_dump(), "type": "function"} for tool in tools] if tools else []
|
|
79
|
+
response = await self._policy._make_llm_call(messages=messages, tools=tools)
|
|
63
80
|
return response["choices"][0]["message"]
|
|
64
81
|
|
|
82
|
+
async def _execute_tool_call(self, tool_call_id: str, tool_name: str, tool_args_dict: dict) -> tuple[str, str]:
|
|
83
|
+
"""
|
|
84
|
+
Execute a single tool call and return the tool_call_id and content.
|
|
85
|
+
This method is designed to be used with asyncio.gather() for parallel execution.
|
|
86
|
+
"""
|
|
87
|
+
tool_result = await self.mcp_client.call_tool(tool_name, tool_args_dict)
|
|
88
|
+
content = self._get_content_from_tool_result(tool_result)
|
|
89
|
+
return tool_call_id, content
|
|
90
|
+
|
|
65
91
|
def _get_content_from_tool_result(self, tool_result: CallToolResult) -> str:
|
|
66
92
|
if tool_result.structuredContent:
|
|
67
93
|
return json.dumps(tool_result.structuredContent)
|
|
@@ -2,6 +2,7 @@ import asyncio
|
|
|
2
2
|
import os
|
|
3
3
|
import subprocess
|
|
4
4
|
import time
|
|
5
|
+
import socket
|
|
5
6
|
from pathlib import Path
|
|
6
7
|
from typing import List, Optional
|
|
7
8
|
|
|
@@ -69,11 +70,8 @@ class MCPServerManager:
|
|
|
69
70
|
self._log_file = log_file
|
|
70
71
|
self._log_file_path = log_file_path
|
|
71
72
|
|
|
72
|
-
# Wait for server to
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
# Check if process is still running
|
|
76
|
-
if self.process.poll() is not None:
|
|
73
|
+
# Wait for server to be ready with proper health check
|
|
74
|
+
if not self._wait_for_server_ready(timeout=15):
|
|
77
75
|
try:
|
|
78
76
|
with open(self._log_file_path, "r") as f:
|
|
79
77
|
log_content = f.read()
|
|
@@ -82,13 +80,45 @@ class MCPServerManager:
|
|
|
82
80
|
print("=" * 50)
|
|
83
81
|
print(log_content)
|
|
84
82
|
print("=" * 50)
|
|
85
|
-
raise RuntimeError(f"Server failed to start. Check log above for details.")
|
|
83
|
+
raise RuntimeError(f"Server failed to start or become ready. Check log above for details.")
|
|
86
84
|
except Exception as e:
|
|
87
85
|
stdout, stderr = self.process.communicate()
|
|
88
|
-
raise RuntimeError(f"Server failed to start. stderr: {stderr}, log error: {e}")
|
|
86
|
+
raise RuntimeError(f"Server failed to start or become ready. stderr: {stderr}, log error: {e}")
|
|
89
87
|
|
|
90
88
|
print(f"✅ Server started successfully on port {self.port}")
|
|
91
89
|
|
|
90
|
+
def _wait_for_server_ready(self, timeout: int = 15) -> bool:
|
|
91
|
+
"""
|
|
92
|
+
Wait for server to be ready by polling socket connection.
|
|
93
|
+
"""
|
|
94
|
+
start_time = time.time()
|
|
95
|
+
health_check_failures = 0
|
|
96
|
+
|
|
97
|
+
while time.time() - start_time < timeout:
|
|
98
|
+
# Check if process is still running
|
|
99
|
+
if self.process.poll() is not None:
|
|
100
|
+
print(f"Server process exited early")
|
|
101
|
+
return False
|
|
102
|
+
|
|
103
|
+
try:
|
|
104
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
105
|
+
s.settimeout(1)
|
|
106
|
+
result = s.connect_ex(("localhost", self.port))
|
|
107
|
+
if result == 0:
|
|
108
|
+
time.sleep(0.5)
|
|
109
|
+
return True
|
|
110
|
+
except Exception as e:
|
|
111
|
+
health_check_failures += 1
|
|
112
|
+
# Print first few failures for debugging
|
|
113
|
+
if health_check_failures <= 3:
|
|
114
|
+
print(f"Health check failed: {e}")
|
|
115
|
+
|
|
116
|
+
# Wait before next check
|
|
117
|
+
time.sleep(0.1)
|
|
118
|
+
|
|
119
|
+
print(f"Server failed to become ready within {timeout} seconds")
|
|
120
|
+
return False
|
|
121
|
+
|
|
92
122
|
def stop(self) -> None:
|
|
93
123
|
"""Stop the MCP server."""
|
|
94
124
|
if self.process:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP). EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -96,8 +96,53 @@ Dynamic: license-file
|
|
|
96
96
|
|
|
97
97
|
[PyPI - Version](https://pypi.org/project/eval-protocol/)
|
|
98
98
|
|
|
99
|
-
EP is an open
|
|
100
|
-
language model (LLM)
|
|
99
|
+
EP is an open specification, Python SDK, and pytest wrapper that provides a
|
|
100
|
+
standardized way to write evaluations for large language model (LLM)
|
|
101
|
+
applications. Start with simple single-turn evals for model selection and prompt
|
|
102
|
+
engineering, then scale up to complex multi-turn reinforcement learning (RL) for
|
|
103
|
+
agents using Model Context Protocol (MCP). EP ensures consistent patterns for
|
|
104
|
+
writing evals, storing traces, and saving results—enabling you to build
|
|
105
|
+
sophisticated agent evaluations that work across real-world scenarios, from
|
|
106
|
+
markdown generation tasks to customer service agents with tool calling
|
|
107
|
+
capabilities.
|
|
108
|
+
|
|
109
|
+
## Quick Example
|
|
110
|
+
|
|
111
|
+
Here's a simple test function that checks if a model's response contains **bold** text formatting:
|
|
112
|
+
|
|
113
|
+
```python test_bold_format.py
|
|
114
|
+
from eval_protocol.models import EvaluateResult, EvaluationRow, Message
|
|
115
|
+
from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test
|
|
116
|
+
|
|
117
|
+
@evaluation_test(
|
|
118
|
+
input_messages=[
|
|
119
|
+
[
|
|
120
|
+
Message(role="system", content="You are a helpful assistant. Use bold text to highlight important information."),
|
|
121
|
+
Message(role="user", content="Explain why **evaluations** matter for building AI agents. Make it dramatic!"),
|
|
122
|
+
],
|
|
123
|
+
],
|
|
124
|
+
model=["accounts/fireworks/models/llama-v3p1-8b-instruct"],
|
|
125
|
+
rollout_processor=default_single_turn_rollout_processor,
|
|
126
|
+
mode="pointwise",
|
|
127
|
+
)
|
|
128
|
+
def test_bold_format(row: EvaluationRow) -> EvaluationRow:
|
|
129
|
+
"""
|
|
130
|
+
Simple evaluation that checks if the model's response contains bold text.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
assistant_response = row.messages[-1].content
|
|
134
|
+
|
|
135
|
+
# Check if response contains **bold** text
|
|
136
|
+
has_bold = "**" in assistant_response
|
|
137
|
+
|
|
138
|
+
if has_bold:
|
|
139
|
+
result = EvaluateResult(score=1.0, reason="✅ Response contains bold text")
|
|
140
|
+
else:
|
|
141
|
+
result = EvaluateResult(score=0.0, reason="❌ No bold text found")
|
|
142
|
+
|
|
143
|
+
row.evaluation_result = result
|
|
144
|
+
return row
|
|
145
|
+
```
|
|
101
146
|
|
|
102
147
|
## Documentation
|
|
103
148
|
|
eval_protocol-0.2.1/README.md
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
# Eval Protocol (EP)
|
|
2
|
-
|
|
3
|
-
[PyPI - Version](https://pypi.org/project/eval-protocol/)
|
|
4
|
-
|
|
5
|
-
EP is an open protocol that standardizes how developers author evals for large
|
|
6
|
-
language model (LLM) applications.
|
|
7
|
-
|
|
8
|
-
## Documentation
|
|
9
|
-
|
|
10
|
-
See our [documentation](https://evalprotocol.io) for more details.
|
|
11
|
-
|
|
12
|
-
## Installation
|
|
13
|
-
|
|
14
|
-
**This library requires Python >= 3.10.**
|
|
15
|
-
|
|
16
|
-
Install with pip:
|
|
17
|
-
|
|
18
|
-
```
|
|
19
|
-
pip install eval-protocol
|
|
20
|
-
```
|
|
21
|
-
|
|
22
|
-
## License
|
|
23
|
-
|
|
24
|
-
[MIT](LICENSE)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/math_api.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/docker_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/filesystem_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_protocol.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/http_rollout_resource.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/agent/resources/python_state_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.1 → eval_protocol-0.2.3}/eval_protocol/mcp_agent/orchestration/base_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|