eval-protocol 0.2.94.dev2__tar.gz → 0.2.94.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.94.dev2/eval_protocol.egg-info → eval_protocol-0.2.94.dev3}/PKG-INFO +1 -1
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_aime25.py +5 -2
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_gpqa.py +5 -2
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_livebench_data_analysis.py +5 -2
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/base_policy.py +0 -4
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/policy.py +23 -28
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/models.py +36 -1
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_agent_rollout_processor.py +8 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_single_turn_rollout_process.py +10 -6
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/accuracy.py +13 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/json_schema.py +11 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/language_consistency.py +13 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/repetition.py +13 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/tag_count.py +13 -3
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol.egg-info/SOURCES.txt +3 -3
- eval_protocol-0.2.94.dev3/vite-app/dist/assets/index-CuQbfdPD.js +46 -0
- eval_protocol-0.2.94.dev3/vite-app/dist/assets/index-CuQbfdPD.js.map +1 -0
- eval_protocol-0.2.94.dev3/vite-app/dist/assets/index-iZp_HgyW.css +1 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.94.dev2/vite-app/dist/assets/index-BIhepl19.css +0 -1
- eval_protocol-0.2.94.dev2/vite-app/dist/assets/index-DaovgarD.js +0 -137
- eval_protocol-0.2.94.dev2/vite-app/dist/assets/index-DaovgarD.js.map +0 -1
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/LICENSE +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/README.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/development/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/pyproject.toml +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/setup.cfg +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/setup.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_format.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_length.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_math.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_models.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/versioneer.py +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.94.
|
|
3
|
+
Version: 0.2.94.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-12-
|
|
11
|
+
"date": "2025-12-02T21:46:38-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.94-dev.
|
|
14
|
+
"full-revisionid": "01bc8e998a3a0370fc0631d39b4fbd8b4b5c7941",
|
|
15
|
+
"version": "0.2.94-dev.3"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_aime25.py
RENAMED
|
@@ -5,6 +5,7 @@ from eval_protocol.models import (
|
|
|
5
5
|
EvaluationRow,
|
|
6
6
|
Message,
|
|
7
7
|
MetricResult,
|
|
8
|
+
ChatCompletionContentPartParam,
|
|
8
9
|
ChatCompletionContentPartTextParam,
|
|
9
10
|
)
|
|
10
11
|
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
@@ -18,10 +19,12 @@ SYSTEM_PROMPT = (
|
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
def _coerce_content_to_str(
|
|
21
|
-
content: str | list[
|
|
22
|
+
content: str | list[ChatCompletionContentPartParam] | None,
|
|
22
23
|
) -> str:
|
|
23
24
|
if isinstance(content, list):
|
|
24
|
-
return "".join(
|
|
25
|
+
return "".join(
|
|
26
|
+
getattr(p, "text", str(p)) if isinstance(p, ChatCompletionContentPartTextParam) else "" for p in content
|
|
27
|
+
)
|
|
25
28
|
return str(content or "")
|
|
26
29
|
|
|
27
30
|
|
{eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/benchmarks/test_gpqa.py
RENAMED
|
@@ -10,6 +10,7 @@ from eval_protocol.models import (
|
|
|
10
10
|
EvaluationRow,
|
|
11
11
|
Message,
|
|
12
12
|
MetricResult,
|
|
13
|
+
ChatCompletionContentPartParam,
|
|
13
14
|
ChatCompletionContentPartTextParam,
|
|
14
15
|
)
|
|
15
16
|
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
@@ -54,10 +55,12 @@ def _load_gpqa_messages_from_csv() -> list[list[list[Message]]]:
|
|
|
54
55
|
|
|
55
56
|
|
|
56
57
|
def _coerce_content_to_str(
|
|
57
|
-
content: str | list[
|
|
58
|
+
content: str | list[ChatCompletionContentPartParam] | None,
|
|
58
59
|
) -> str:
|
|
59
60
|
if isinstance(content, list):
|
|
60
|
-
return "".join(
|
|
61
|
+
return "".join(
|
|
62
|
+
getattr(p, "text", str(p)) if isinstance(p, ChatCompletionContentPartTextParam) else "" for p in content
|
|
63
|
+
)
|
|
61
64
|
return str(content or "")
|
|
62
65
|
|
|
63
66
|
|
|
@@ -8,6 +8,7 @@ from eval_protocol.models import (
|
|
|
8
8
|
EvaluationRow,
|
|
9
9
|
Message,
|
|
10
10
|
MetricResult,
|
|
11
|
+
ChatCompletionContentPartParam,
|
|
11
12
|
ChatCompletionContentPartTextParam,
|
|
12
13
|
)
|
|
13
14
|
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
@@ -37,9 +38,11 @@ def _extract_last_boxed_segment(text: str) -> Optional[str]:
|
|
|
37
38
|
return matches[-1]
|
|
38
39
|
|
|
39
40
|
|
|
40
|
-
def _coerce_content_to_str(content: str | list[
|
|
41
|
+
def _coerce_content_to_str(content: str | list[ChatCompletionContentPartParam] | None) -> str:
|
|
41
42
|
if isinstance(content, list):
|
|
42
|
-
return "".join(
|
|
43
|
+
return "".join(
|
|
44
|
+
getattr(p, "text", str(p)) if isinstance(p, ChatCompletionContentPartTextParam) else "" for p in content
|
|
45
|
+
)
|
|
43
46
|
return str(content or "")
|
|
44
47
|
|
|
45
48
|
|
{eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/base_policy.py
RENAMED
|
@@ -199,10 +199,6 @@ class LLMBasePolicy(PlaybackPolicyBase, ABC):
|
|
|
199
199
|
if message.get("tool_calls"):
|
|
200
200
|
assistant_message_for_history["tool_calls"] = message["tool_calls"]
|
|
201
201
|
|
|
202
|
-
rd = message.get("reasoning_details", None)
|
|
203
|
-
if rd is not None:
|
|
204
|
-
assistant_message_for_history["reasoning_details"] = rd
|
|
205
|
-
|
|
206
202
|
# Add to actual conversation history
|
|
207
203
|
conversation_history.append(assistant_message_for_history)
|
|
208
204
|
|
{eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/mcp/execution/policy.py
RENAMED
|
@@ -146,7 +146,7 @@ class LiteLLMPolicy(LLMBasePolicy):
|
|
|
146
146
|
Clean messages with only OpenAI API compatible fields
|
|
147
147
|
"""
|
|
148
148
|
# Standard OpenAI message fields
|
|
149
|
-
allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"
|
|
149
|
+
allowed_fields = {"role", "content", "tool_calls", "tool_call_id", "name"}
|
|
150
150
|
|
|
151
151
|
clean_messages = []
|
|
152
152
|
for msg in messages:
|
|
@@ -217,36 +217,31 @@ class LiteLLMPolicy(LLMBasePolicy):
|
|
|
217
217
|
logger.debug(f"🔄 API call for model: {self.model_id}")
|
|
218
218
|
|
|
219
219
|
# LiteLLM already returns OpenAI-compatible format
|
|
220
|
-
message_obj = getattr(response.choices[0], "message", object())
|
|
221
|
-
|
|
222
|
-
message_dict: Dict[str, Any] = {
|
|
223
|
-
"role": getattr(message_obj, "role", "assistant"),
|
|
224
|
-
"content": getattr(message_obj, "content", None),
|
|
225
|
-
"tool_calls": (
|
|
226
|
-
[
|
|
227
|
-
{
|
|
228
|
-
"id": getattr(tc, "id", None),
|
|
229
|
-
"type": getattr(tc, "type", "function"),
|
|
230
|
-
"function": {
|
|
231
|
-
"name": getattr(getattr(tc, "function", None), "name", "tool"),
|
|
232
|
-
"arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
|
|
233
|
-
},
|
|
234
|
-
}
|
|
235
|
-
for tc in (getattr(message_obj, "tool_calls", []) or [])
|
|
236
|
-
]
|
|
237
|
-
if getattr(message_obj, "tool_calls", None)
|
|
238
|
-
else []
|
|
239
|
-
),
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
rd = getattr(message_obj, "reasoning_details", None)
|
|
243
|
-
if rd is not None:
|
|
244
|
-
message_dict["reasoning_details"] = rd
|
|
245
|
-
|
|
246
220
|
return {
|
|
247
221
|
"choices": [
|
|
248
222
|
{
|
|
249
|
-
"message":
|
|
223
|
+
"message": {
|
|
224
|
+
"role": getattr(getattr(response.choices[0], "message", object()), "role", "assistant"),
|
|
225
|
+
"content": getattr(getattr(response.choices[0], "message", object()), "content", None),
|
|
226
|
+
"tool_calls": (
|
|
227
|
+
[
|
|
228
|
+
{
|
|
229
|
+
"id": getattr(tc, "id", None),
|
|
230
|
+
"type": getattr(tc, "type", "function"),
|
|
231
|
+
"function": {
|
|
232
|
+
"name": getattr(getattr(tc, "function", None), "name", "tool"),
|
|
233
|
+
"arguments": getattr(getattr(tc, "function", None), "arguments", "{}"),
|
|
234
|
+
},
|
|
235
|
+
}
|
|
236
|
+
for tc in (
|
|
237
|
+
getattr(getattr(response.choices[0], "message", object()), "tool_calls", [])
|
|
238
|
+
or []
|
|
239
|
+
)
|
|
240
|
+
]
|
|
241
|
+
if getattr(getattr(response.choices[0], "message", object()), "tool_calls", None)
|
|
242
|
+
else []
|
|
243
|
+
),
|
|
244
|
+
},
|
|
250
245
|
"finish_reason": getattr(response.choices[0], "finish_reason", None),
|
|
251
246
|
}
|
|
252
247
|
],
|
|
@@ -466,11 +466,46 @@ class ChatCompletionContentPartTextParam(BaseModel):
|
|
|
466
466
|
return iter(["text", "type"])
|
|
467
467
|
|
|
468
468
|
|
|
469
|
+
class ChatCompletionContentPartImageParam(BaseModel):
|
|
470
|
+
type: Literal["image_url"] = Field("image_url", description="The type of the content part.")
|
|
471
|
+
image_url: Dict[str, Any] = Field(
|
|
472
|
+
..., description="Image descriptor (e.g., {'url': 'data:image/png;base64,...', 'detail': 'high'})."
|
|
473
|
+
)
|
|
474
|
+
|
|
475
|
+
def __getitem__(self, key: str) -> Any:
|
|
476
|
+
if key == "image_url":
|
|
477
|
+
return self.image_url
|
|
478
|
+
if key == "type":
|
|
479
|
+
return self.type
|
|
480
|
+
raise KeyError(key)
|
|
481
|
+
|
|
482
|
+
def get(self, key: str, default: Any = None) -> Any:
|
|
483
|
+
try:
|
|
484
|
+
return self[key]
|
|
485
|
+
except KeyError:
|
|
486
|
+
return default
|
|
487
|
+
|
|
488
|
+
def keys(self):
|
|
489
|
+
return (k for k in ("image_url", "type"))
|
|
490
|
+
|
|
491
|
+
def values(self):
|
|
492
|
+
return (self.image_url, self.type)
|
|
493
|
+
|
|
494
|
+
def items(self):
|
|
495
|
+
return [("image_url", self.image_url), ("type", self.type)]
|
|
496
|
+
|
|
497
|
+
def __iter__(self):
|
|
498
|
+
return iter(["image_url", "type"])
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
ChatCompletionContentPartParam = Union[ChatCompletionContentPartTextParam, ChatCompletionContentPartImageParam]
|
|
502
|
+
|
|
503
|
+
|
|
469
504
|
class Message(BaseModel):
|
|
470
505
|
"""Chat message model with trajectory evaluation support."""
|
|
471
506
|
|
|
472
507
|
role: str # assistant, user, system, tool
|
|
473
|
-
content: Optional[Union[str, List[
|
|
508
|
+
content: Optional[Union[str, List[ChatCompletionContentPartParam]]] = Field(
|
|
474
509
|
default="", description="The content of the message."
|
|
475
510
|
)
|
|
476
511
|
reasoning_content: Optional[str] = Field(
|
|
@@ -13,7 +13,12 @@ from openai.types.chat.chat_completion_message_param import ChatCompletionMessag
|
|
|
13
13
|
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
|
|
14
14
|
from eval_protocol.mcp.execution.policy import LiteLLMPolicy
|
|
15
15
|
from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
|
|
16
|
-
from eval_protocol.models import
|
|
16
|
+
from eval_protocol.models import (
|
|
17
|
+
EvaluationRow,
|
|
18
|
+
Message,
|
|
19
|
+
ChatCompletionContentPartParam,
|
|
20
|
+
ChatCompletionContentPartTextParam,
|
|
21
|
+
)
|
|
17
22
|
from openai.types import CompletionUsage
|
|
18
23
|
from eval_protocol.pytest.rollout_processor import RolloutProcessor
|
|
19
24
|
from eval_protocol.pytest.types import Dataset, RolloutProcessorConfig
|
|
@@ -98,7 +103,7 @@ class Agent:
|
|
|
98
103
|
self.messages.append(message)
|
|
99
104
|
self.logger.log(self.evaluation_row)
|
|
100
105
|
|
|
101
|
-
async def call_agent(self) -> Optional[Union[str, List[
|
|
106
|
+
async def call_agent(self) -> Optional[Union[str, List[ChatCompletionContentPartParam]]]:
|
|
102
107
|
"""
|
|
103
108
|
Call the assistant with the user query.
|
|
104
109
|
"""
|
|
@@ -222,7 +227,7 @@ class Agent:
|
|
|
222
227
|
|
|
223
228
|
def _format_tool_message_content(
|
|
224
229
|
self, content: List[TextContent]
|
|
225
|
-
) -> Union[str, List[
|
|
230
|
+
) -> Union[str, List[ChatCompletionContentPartParam]]:
|
|
226
231
|
"""Format tool result content for inclusion in a tool message.
|
|
227
232
|
|
|
228
233
|
- If a single text item, return plain string per OpenAI semantics.
|
|
@@ -166,13 +166,17 @@ class SingleTurnRolloutProcessor(RolloutProcessor):
|
|
|
166
166
|
row.execution_metadata.tool_call_count = (
|
|
167
167
|
len(converted_tool_calls) if converted_tool_calls is not None else 0
|
|
168
168
|
)
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
169
|
+
usage = getattr(response, "usage", None)
|
|
170
|
+
if usage:
|
|
171
|
+
row.execution_metadata.usage = (
|
|
172
|
+
CompletionUsage( # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field
|
|
173
|
+
prompt_tokens=getattr(usage, "prompt_tokens", 0),
|
|
174
|
+
completion_tokens=getattr(usage, "completion_tokens", 0),
|
|
175
|
+
total_tokens=getattr(usage, "total_tokens", 0),
|
|
176
|
+
)
|
|
174
177
|
)
|
|
175
|
-
|
|
178
|
+
else:
|
|
179
|
+
row.execution_metadata.usage = None
|
|
176
180
|
|
|
177
181
|
row.messages = messages
|
|
178
182
|
|
|
@@ -10,10 +10,16 @@ like normalization and LaTeX parsing.
|
|
|
10
10
|
import re
|
|
11
11
|
from typing import Any, Callable, Dict, List, Optional, Union, cast
|
|
12
12
|
|
|
13
|
-
from ..models import
|
|
13
|
+
from ..models import (
|
|
14
|
+
EvaluateResult,
|
|
15
|
+
Message,
|
|
16
|
+
MetricResult,
|
|
17
|
+
ChatCompletionContentPartParam,
|
|
18
|
+
ChatCompletionContentPartTextParam,
|
|
19
|
+
)
|
|
14
20
|
|
|
15
21
|
|
|
16
|
-
def _to_text(content: Optional[Union[str, List[
|
|
22
|
+
def _to_text(content: Optional[Union[str, List[ChatCompletionContentPartParam]]]) -> str:
|
|
17
23
|
"""Coerce Message.content into a plain string for regex and comparisons."""
|
|
18
24
|
if content is None:
|
|
19
25
|
return ""
|
|
@@ -21,7 +27,11 @@ def _to_text(content: Optional[Union[str, List[ChatCompletionContentPartTextPara
|
|
|
21
27
|
return content
|
|
22
28
|
# List[ChatCompletionContentPartTextParam]
|
|
23
29
|
try:
|
|
24
|
-
|
|
30
|
+
texts: List[str] = []
|
|
31
|
+
for part in content:
|
|
32
|
+
if isinstance(part, ChatCompletionContentPartTextParam):
|
|
33
|
+
texts.append(part.text)
|
|
34
|
+
return "\n".join(texts)
|
|
25
35
|
except Exception:
|
|
26
36
|
return ""
|
|
27
37
|
|
{eval_protocol-0.2.94.dev2 → eval_protocol-0.2.94.dev3}/eval_protocol/rewards/json_schema.py
RENAMED
|
@@ -2,7 +2,13 @@ import json
|
|
|
2
2
|
import re
|
|
3
3
|
from typing import Any, Dict, List, Optional, Union
|
|
4
4
|
|
|
5
|
-
from ..models import
|
|
5
|
+
from ..models import (
|
|
6
|
+
EvaluateResult,
|
|
7
|
+
Message,
|
|
8
|
+
MetricResult,
|
|
9
|
+
ChatCompletionContentPartParam,
|
|
10
|
+
ChatCompletionContentPartTextParam,
|
|
11
|
+
)
|
|
6
12
|
from ..typed_interface import reward_function
|
|
7
13
|
from .function_calling import (
|
|
8
14
|
calculate_jaccard_similarity,
|
|
@@ -59,8 +65,10 @@ def json_schema_reward(
|
|
|
59
65
|
content_text = last_message.content
|
|
60
66
|
else:
|
|
61
67
|
try:
|
|
62
|
-
parts: List[
|
|
63
|
-
content_text = "\n".join(
|
|
68
|
+
parts: List[ChatCompletionContentPartParam] = last_message.content # type: ignore[assignment]
|
|
69
|
+
content_text = "\n".join(
|
|
70
|
+
getattr(p, "text", "") for p in parts if isinstance(p, ChatCompletionContentPartTextParam)
|
|
71
|
+
)
|
|
64
72
|
except Exception:
|
|
65
73
|
content_text = ""
|
|
66
74
|
else:
|
|
@@ -9,7 +9,13 @@ are in the expected language.
|
|
|
9
9
|
import re
|
|
10
10
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
11
11
|
|
|
12
|
-
from ..models import
|
|
12
|
+
from ..models import (
|
|
13
|
+
EvaluateResult,
|
|
14
|
+
Message,
|
|
15
|
+
MetricResult,
|
|
16
|
+
ChatCompletionContentPartParam,
|
|
17
|
+
ChatCompletionContentPartTextParam,
|
|
18
|
+
)
|
|
13
19
|
from ..typed_interface import reward_function
|
|
14
20
|
|
|
15
21
|
# Dictionary mapping language codes to common words/patterns in that language
|
|
@@ -573,13 +579,17 @@ def language_consistency_reward(
|
|
|
573
579
|
},
|
|
574
580
|
)
|
|
575
581
|
|
|
576
|
-
def _to_text(content: Union[str, List[
|
|
582
|
+
def _to_text(content: Union[str, List[ChatCompletionContentPartParam], None]) -> str:
|
|
577
583
|
if content is None:
|
|
578
584
|
return ""
|
|
579
585
|
if isinstance(content, str):
|
|
580
586
|
return content
|
|
581
587
|
try:
|
|
582
|
-
|
|
588
|
+
texts: List[str] = []
|
|
589
|
+
for part in content:
|
|
590
|
+
if isinstance(part, ChatCompletionContentPartTextParam):
|
|
591
|
+
texts.append(part.text)
|
|
592
|
+
return "\n".join(texts)
|
|
583
593
|
except Exception:
|
|
584
594
|
return ""
|
|
585
595
|
|
|
@@ -8,16 +8,26 @@ encouraging more diverse and information-rich outputs.
|
|
|
8
8
|
import re
|
|
9
9
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
10
10
|
|
|
11
|
-
from ..models import
|
|
11
|
+
from ..models import (
|
|
12
|
+
EvaluateResult,
|
|
13
|
+
Message,
|
|
14
|
+
MetricResult,
|
|
15
|
+
ChatCompletionContentPartParam,
|
|
16
|
+
ChatCompletionContentPartTextParam,
|
|
17
|
+
)
|
|
12
18
|
|
|
13
19
|
|
|
14
|
-
def _to_text(content: Optional[Union[str, List[
|
|
20
|
+
def _to_text(content: Optional[Union[str, List[ChatCompletionContentPartParam]]]) -> str:
|
|
15
21
|
if content is None:
|
|
16
22
|
return ""
|
|
17
23
|
if isinstance(content, str):
|
|
18
24
|
return content
|
|
19
25
|
try:
|
|
20
|
-
|
|
26
|
+
texts: List[str] = []
|
|
27
|
+
for part in content:
|
|
28
|
+
if isinstance(part, ChatCompletionContentPartTextParam):
|
|
29
|
+
texts.append(part.text)
|
|
30
|
+
return "\n".join(texts)
|
|
21
31
|
except Exception:
|
|
22
32
|
return ""
|
|
23
33
|
|
|
@@ -8,16 +8,26 @@ specified XML/HTML-like tags in correct quantities.
|
|
|
8
8
|
import re
|
|
9
9
|
from typing import Any, Dict, List, Set, Union
|
|
10
10
|
|
|
11
|
-
from ..models import
|
|
11
|
+
from ..models import (
|
|
12
|
+
EvaluateResult,
|
|
13
|
+
Message,
|
|
14
|
+
MetricResult,
|
|
15
|
+
ChatCompletionContentPartParam,
|
|
16
|
+
ChatCompletionContentPartTextParam,
|
|
17
|
+
)
|
|
12
18
|
|
|
13
19
|
|
|
14
|
-
def _to_text(content: Union[str, List[
|
|
20
|
+
def _to_text(content: Union[str, List[ChatCompletionContentPartParam], None]) -> str:
|
|
15
21
|
if content is None:
|
|
16
22
|
return ""
|
|
17
23
|
if isinstance(content, str):
|
|
18
24
|
return content
|
|
19
25
|
try:
|
|
20
|
-
|
|
26
|
+
texts: List[str] = []
|
|
27
|
+
for part in content:
|
|
28
|
+
if isinstance(part, ChatCompletionContentPartTextParam):
|
|
29
|
+
texts.append(part.text)
|
|
30
|
+
return "\n".join(texts)
|
|
21
31
|
except Exception:
|
|
22
32
|
return ""
|
|
23
33
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.94.dev2
|
|
3
|
+
Version: 0.2.94.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -42,9 +42,9 @@ eval_protocol.egg-info/requires.txt
|
|
|
42
42
|
eval_protocol.egg-info/top_level.txt
|
|
43
43
|
eval_protocol/../vite-app/dist/index.html
|
|
44
44
|
eval_protocol/../vite-app/dist/assets/favicon-BkAAWQga.png
|
|
45
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
46
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
47
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
45
|
+
eval_protocol/../vite-app/dist/assets/index-CuQbfdPD.js
|
|
46
|
+
eval_protocol/../vite-app/dist/assets/index-CuQbfdPD.js.map
|
|
47
|
+
eval_protocol/../vite-app/dist/assets/index-iZp_HgyW.css
|
|
48
48
|
eval_protocol/../vite-app/dist/assets/logo-light-BprIBJQW.png
|
|
49
49
|
eval_protocol/adapters/__init__.py
|
|
50
50
|
eval_protocol/adapters/base.py
|