eval-protocol 0.3.9.dev1__tar.gz → 0.3.9.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.9.dev1/eval_protocol.egg-info → eval_protocol-0.3.9.dev3}/PKG-INFO +1 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/evaluation_test.py +22 -25
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/evaluation_test_utils.py +0 -19
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/remote_rollout_processor.py +16 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/tracing_utils.py +12 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol.egg-info/SOURCES.txt +3 -3
- eval_protocol-0.3.9.dev3/vite-app/dist/assets/index-10cZ11iB.js +137 -0
- eval_protocol-0.3.9.dev3/vite-app/dist/assets/index-10cZ11iB.js.map +1 -0
- eval_protocol-0.3.9.dev3/vite-app/dist/assets/index-DOD73Wyg.css +1 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vite-app/dist/index.html +2 -2
- eval_protocol-0.3.9.dev1/vite-app/dist/assets/index-CuQbfdPD.js +0 -46
- eval_protocol-0.3.9.dev1/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -1
- eval_protocol-0.3.9.dev1/vite-app/dist/assets/index-iZp_HgyW.css +0 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/LICENSE +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/README.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/development/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/pyproject.toml +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/setup.cfg +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/setup.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_format.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_math.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/versioneer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.9.dev1
|
|
3
|
+
Version: 0.3.9.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-01-
|
|
11
|
+
"date": "2026-01-08T14:08:17-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.9.dev.
|
|
14
|
+
"full-revisionid": "74e35d4e2e53433124d13671c12a4677078a8b0a",
|
|
15
|
+
"version": "0.3.9.dev.3"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/evaluation_test.py
RENAMED
|
@@ -20,12 +20,13 @@ from eval_protocol.models import (
|
|
|
20
20
|
EvaluationRow,
|
|
21
21
|
EvaluationThreshold,
|
|
22
22
|
EvaluationThresholdDict,
|
|
23
|
+
EvaluateResult,
|
|
23
24
|
Status,
|
|
24
25
|
EPParameters,
|
|
25
26
|
)
|
|
26
27
|
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
|
|
27
28
|
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
|
|
28
|
-
from eval_protocol.pytest.execution import execute_pytest_with_exception_handling
|
|
29
|
+
from eval_protocol.pytest.execution import execute_pytest, execute_pytest_with_exception_handling
|
|
29
30
|
from eval_protocol.pytest.priority_scheduler import execute_priority_rollouts
|
|
30
31
|
from eval_protocol.pytest.generate_parameter_combinations import (
|
|
31
32
|
ParameterizedTestKwargs,
|
|
@@ -55,7 +56,6 @@ from eval_protocol.pytest.evaluation_test_utils import (
|
|
|
55
56
|
AggregationMethod,
|
|
56
57
|
add_cost_metrics,
|
|
57
58
|
log_eval_status_and_rows,
|
|
58
|
-
normalize_fireworks_model,
|
|
59
59
|
parse_ep_completion_params,
|
|
60
60
|
parse_ep_completion_params_overwrite,
|
|
61
61
|
parse_ep_max_concurrent_rollouts,
|
|
@@ -205,7 +205,6 @@ def evaluation_test(
|
|
|
205
205
|
max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
|
|
206
206
|
completion_params = parse_ep_completion_params(completion_params)
|
|
207
207
|
completion_params = parse_ep_completion_params_overwrite(completion_params)
|
|
208
|
-
completion_params = [normalize_fireworks_model(cp) for cp in completion_params]
|
|
209
208
|
original_completion_params = completion_params
|
|
210
209
|
passed_threshold = parse_ep_passed_threshold(passed_threshold)
|
|
211
210
|
data_loaders = parse_ep_dataloaders(data_loaders)
|
|
@@ -366,7 +365,6 @@ def evaluation_test(
|
|
|
366
365
|
row.input_metadata.row_id = generate_id(seed=0, index=index)
|
|
367
366
|
|
|
368
367
|
completion_params = kwargs["completion_params"] if "completion_params" in kwargs else None
|
|
369
|
-
completion_params = normalize_fireworks_model(completion_params)
|
|
370
368
|
# Create eval metadata with test function info and current commit hash
|
|
371
369
|
eval_metadata = EvalMetadata(
|
|
372
370
|
name=test_func.__name__,
|
|
@@ -411,22 +409,21 @@ def evaluation_test(
|
|
|
411
409
|
|
|
412
410
|
rollout_processor.setup()
|
|
413
411
|
|
|
414
|
-
use_priority_scheduler =
|
|
415
|
-
|
|
416
|
-
|
|
412
|
+
use_priority_scheduler = (
|
|
413
|
+
(
|
|
414
|
+
os.environ.get("EP_USE_PRIORITY_SCHEDULER", "0") == "1"
|
|
415
|
+
and not isinstance(rollout_processor, MCPGymRolloutProcessor)
|
|
416
|
+
)
|
|
417
|
+
)
|
|
417
418
|
|
|
418
419
|
if use_priority_scheduler:
|
|
419
420
|
microbatch_output_size = os.environ.get("EP_MICRO_BATCH_OUTPUT_SIZE", None)
|
|
420
421
|
output_dir = os.environ.get("EP_OUTPUT_DIR", None)
|
|
421
422
|
if microbatch_output_size and output_dir:
|
|
422
|
-
output_buffer = MicroBatchDataBuffer(
|
|
423
|
-
num_runs=num_runs,
|
|
424
|
-
batch_size=int(microbatch_output_size),
|
|
425
|
-
output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"),
|
|
426
|
-
)
|
|
423
|
+
output_buffer = MicroBatchDataBuffer(num_runs=num_runs, batch_size=int(microbatch_output_size), output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"))
|
|
427
424
|
else:
|
|
428
425
|
output_buffer = None
|
|
429
|
-
|
|
426
|
+
|
|
430
427
|
try:
|
|
431
428
|
priority_results = await execute_priority_rollouts(
|
|
432
429
|
dataset=data,
|
|
@@ -444,12 +441,12 @@ def evaluation_test(
|
|
|
444
441
|
finally:
|
|
445
442
|
if output_buffer:
|
|
446
443
|
await output_buffer.close()
|
|
447
|
-
|
|
444
|
+
|
|
448
445
|
for res in priority_results:
|
|
449
446
|
run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
|
|
450
447
|
if run_idx < len(all_results):
|
|
451
448
|
all_results[run_idx].append(res)
|
|
452
|
-
|
|
449
|
+
|
|
453
450
|
processed_rows_in_run.append(res)
|
|
454
451
|
|
|
455
452
|
postprocess(
|
|
@@ -465,7 +462,6 @@ def evaluation_test(
|
|
|
465
462
|
)
|
|
466
463
|
|
|
467
464
|
else:
|
|
468
|
-
|
|
469
465
|
async def execute_run(run_idx: int, config: RolloutProcessorConfig):
|
|
470
466
|
nonlocal all_results
|
|
471
467
|
|
|
@@ -510,7 +506,9 @@ def evaluation_test(
|
|
|
510
506
|
raise ValueError(
|
|
511
507
|
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
|
|
512
508
|
)
|
|
513
|
-
result.execution_metadata.eval_duration_seconds = time.perf_counter() - start_time
|
|
509
|
+
result.execution_metadata.eval_duration_seconds = (
|
|
510
|
+
time.perf_counter() - start_time
|
|
511
|
+
)
|
|
514
512
|
return result
|
|
515
513
|
|
|
516
514
|
async def _execute_groupwise_eval_with_semaphore(
|
|
@@ -521,9 +519,7 @@ def evaluation_test(
|
|
|
521
519
|
evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
|
|
522
520
|
primary_rollout_id = rows[0].execution_metadata.rollout_id if rows else None
|
|
523
521
|
group_rollout_ids = [
|
|
524
|
-
r.execution_metadata.rollout_id
|
|
525
|
-
for r in rows
|
|
526
|
-
if r.execution_metadata.rollout_id
|
|
522
|
+
r.execution_metadata.rollout_id for r in rows if r.execution_metadata.rollout_id
|
|
527
523
|
]
|
|
528
524
|
async with rollout_logging_context(
|
|
529
525
|
primary_rollout_id or "",
|
|
@@ -600,9 +596,7 @@ def evaluation_test(
|
|
|
600
596
|
row_groups[row.input_metadata.row_id].append(row)
|
|
601
597
|
tasks = []
|
|
602
598
|
for _, rows in row_groups.items():
|
|
603
|
-
tasks.append(
|
|
604
|
-
asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows))
|
|
605
|
-
)
|
|
599
|
+
tasks.append(asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows)))
|
|
606
600
|
results = []
|
|
607
601
|
for task in tasks:
|
|
608
602
|
res = await task
|
|
@@ -698,9 +692,9 @@ def evaluation_test(
|
|
|
698
692
|
# For other processors, create all tasks at once and run in parallel
|
|
699
693
|
# Concurrency is now controlled by the shared semaphore in each rollout processor
|
|
700
694
|
await run_tasks_with_run_progress(execute_run, num_runs, config)
|
|
701
|
-
|
|
695
|
+
|
|
702
696
|
experiment_duration_seconds = time.perf_counter() - experiment_start_time
|
|
703
|
-
|
|
697
|
+
|
|
704
698
|
# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
|
|
705
699
|
# rollout_id is used to differentiate the result from different completion_params
|
|
706
700
|
if mode == "groupwise":
|
|
@@ -736,12 +730,15 @@ def evaluation_test(
|
|
|
736
730
|
experiment_duration_seconds,
|
|
737
731
|
)
|
|
738
732
|
|
|
733
|
+
|
|
734
|
+
|
|
739
735
|
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
|
|
740
736
|
raise AssertionError(
|
|
741
737
|
"Some EvaluationRow instances are missing evaluation_result. "
|
|
742
738
|
"Your @evaluation_test function must set `row.evaluation_result`"
|
|
743
739
|
)
|
|
744
740
|
|
|
741
|
+
|
|
745
742
|
except AssertionError:
|
|
746
743
|
_log_eval_error(
|
|
747
744
|
Status.eval_finished(),
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev3}/eval_protocol/pytest/evaluation_test_utils.py
RENAMED
|
@@ -619,22 +619,3 @@ def build_rollout_processor_config(
|
|
|
619
619
|
server_script_path=None,
|
|
620
620
|
kwargs=rollout_processor_kwargs,
|
|
621
621
|
)
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
def normalize_fireworks_model(completion_params: CompletionParams | None) -> CompletionParams | None:
|
|
625
|
-
"""Fireworks model names like 'accounts/<org>/models/<model>' need the fireworks_ai/
|
|
626
|
-
prefix when routing through LiteLLM. This function adds the prefix if missing.
|
|
627
|
-
"""
|
|
628
|
-
if completion_params is None:
|
|
629
|
-
return None
|
|
630
|
-
|
|
631
|
-
model = completion_params.get("model")
|
|
632
|
-
if (
|
|
633
|
-
model
|
|
634
|
-
and isinstance(model, str)
|
|
635
|
-
and not model.startswith("fireworks_ai/")
|
|
636
|
-
and re.match(r"^accounts/[^/]+/models/.+", model)
|
|
637
|
-
):
|
|
638
|
-
completion_params = completion_params.copy()
|
|
639
|
-
completion_params["model"] = f"fireworks_ai/{model}"
|
|
640
|
-
return completion_params
|
|
@@ -122,9 +122,20 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
122
122
|
status_logs.append(log)
|
|
123
123
|
|
|
124
124
|
if status_logs:
|
|
125
|
+
if len(status_logs) > 1:
|
|
126
|
+
logger.warning(
|
|
127
|
+
"Found %s status logs for rollout %s; expected at most 1. Using the first one: %s",
|
|
128
|
+
len(status_logs),
|
|
129
|
+
row.execution_metadata.rollout_id,
|
|
130
|
+
status_logs[0],
|
|
131
|
+
)
|
|
125
132
|
# Use the first log with status information
|
|
126
133
|
status_log = status_logs[0]
|
|
127
134
|
status_dict = status_log.get("status")
|
|
135
|
+
raw_extras = status_log.get("extras") or {}
|
|
136
|
+
status_extras = {
|
|
137
|
+
k: v for k, v in raw_extras.items() if k not in ("logger_name", "level", "timestamp")
|
|
138
|
+
}
|
|
128
139
|
|
|
129
140
|
logger.info(
|
|
130
141
|
f"Found status log for rollout {row.execution_metadata.rollout_id}: {status_log.get('message', '')}"
|
|
@@ -149,6 +160,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
149
160
|
details=status_details,
|
|
150
161
|
)
|
|
151
162
|
|
|
163
|
+
if row.execution_metadata.extra:
|
|
164
|
+
row.execution_metadata.extra.update(status_extras)
|
|
165
|
+
else:
|
|
166
|
+
row.execution_metadata.extra = status_extras
|
|
167
|
+
|
|
152
168
|
logger.info("Stopping polling for rollout %s", row.execution_metadata.rollout_id)
|
|
153
169
|
break
|
|
154
170
|
|
|
@@ -179,7 +179,18 @@ def update_row_with_remote_trace(
|
|
|
179
179
|
if k not in row.input_metadata.dataset_info:
|
|
180
180
|
row.input_metadata.dataset_info[k] = v
|
|
181
181
|
|
|
182
|
-
|
|
182
|
+
preserved_extra = row.execution_metadata.extra
|
|
183
|
+
|
|
184
|
+
row.execution_metadata = remote_row.execution_metadata.model_copy(deep=True)
|
|
185
|
+
|
|
186
|
+
if preserved_extra:
|
|
187
|
+
if row.execution_metadata.extra:
|
|
188
|
+
# Merge remote and local extras; local takes precedence on conflicts
|
|
189
|
+
merged = row.execution_metadata.extra or {}
|
|
190
|
+
merged.update(preserved_extra)
|
|
191
|
+
row.execution_metadata.extra = merged
|
|
192
|
+
else:
|
|
193
|
+
row.execution_metadata.extra = preserved_extra
|
|
183
194
|
return None
|
|
184
195
|
else:
|
|
185
196
|
raise ValueError("Output data loader should return exactly one row.")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.9.dev1
|
|
3
|
+
Version: 0.3.9.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -42,9 +42,9 @@ eval_protocol.egg-info/requires.txt
|
|
|
42
42
|
eval_protocol.egg-info/top_level.txt
|
|
43
43
|
eval_protocol/../vite-app/dist/index.html
|
|
44
44
|
eval_protocol/../vite-app/dist/assets/favicon-BkAAWQga.png
|
|
45
|
-
eval_protocol/../vite-app/dist/assets/index-CuQbfdPD.js
|
|
46
|
-
eval_protocol/../vite-app/dist/assets/index-CuQbfdPD.js.map
|
|
47
|
-
eval_protocol/../vite-app/dist/assets/index-iZp_HgyW.css
|
|
45
|
+
eval_protocol/../vite-app/dist/assets/index-10cZ11iB.js
|
|
46
|
+
eval_protocol/../vite-app/dist/assets/index-10cZ11iB.js.map
|
|
47
|
+
eval_protocol/../vite-app/dist/assets/index-DOD73Wyg.css
|
|
48
48
|
eval_protocol/../vite-app/dist/assets/logo-light-BprIBJQW.png
|
|
49
49
|
eval_protocol/adapters/__init__.py
|
|
50
50
|
eval_protocol/adapters/base.py
|