eval-protocol 0.3.9.dev1__tar.gz → 0.3.9.dev2__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.3.9.dev1/eval_protocol.egg-info → eval_protocol-0.3.9.dev2}/PKG-INFO +1 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/evaluation_test.py +22 -25
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/evaluation_test_utils.py +0 -19
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/remote_rollout_processor.py +16 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/tracing_utils.py +12 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/LICENSE +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/README.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/dataframe.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/export_docs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/local_test.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/buffer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/training/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/training/gepa_trainer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/training/gepa_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/training/trainer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/training/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/pyproject.toml +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/setup.cfg +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/setup.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_auth.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_priority_scheduler.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_rollout_logprobs.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_sqlite_hardening.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_training_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/versioneer.py +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.9.dev1
|
|
3
|
+
Version: 0.3.9.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2026-01-
|
|
11
|
+
"date": "2026-01-08T11:04:23-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.3.9.dev.1"
|
|
14
|
+
"full-revisionid": "e504b404fd0b77190d42a807cea241e95785a441",
|
|
15
|
+
"version": "0.3.9.dev.2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/evaluation_test.py
RENAMED
|
@@ -20,12 +20,13 @@ from eval_protocol.models import (
|
|
|
20
20
|
EvaluationRow,
|
|
21
21
|
EvaluationThreshold,
|
|
22
22
|
EvaluationThresholdDict,
|
|
23
|
+
EvaluateResult,
|
|
23
24
|
Status,
|
|
24
25
|
EPParameters,
|
|
25
26
|
)
|
|
26
27
|
from eval_protocol.pytest.dual_mode_wrapper import create_dual_mode_wrapper
|
|
27
28
|
from eval_protocol.pytest.evaluation_test_postprocess import postprocess
|
|
28
|
-
from eval_protocol.pytest.execution import execute_pytest_with_exception_handling
|
|
29
|
+
from eval_protocol.pytest.execution import execute_pytest, execute_pytest_with_exception_handling
|
|
29
30
|
from eval_protocol.pytest.priority_scheduler import execute_priority_rollouts
|
|
30
31
|
from eval_protocol.pytest.generate_parameter_combinations import (
|
|
31
32
|
ParameterizedTestKwargs,
|
|
@@ -55,7 +56,6 @@ from eval_protocol.pytest.evaluation_test_utils import (
|
|
|
55
56
|
AggregationMethod,
|
|
56
57
|
add_cost_metrics,
|
|
57
58
|
log_eval_status_and_rows,
|
|
58
|
-
normalize_fireworks_model,
|
|
59
59
|
parse_ep_completion_params,
|
|
60
60
|
parse_ep_completion_params_overwrite,
|
|
61
61
|
parse_ep_max_concurrent_rollouts,
|
|
@@ -205,7 +205,6 @@ def evaluation_test(
|
|
|
205
205
|
max_dataset_rows = parse_ep_max_rows(max_dataset_rows)
|
|
206
206
|
completion_params = parse_ep_completion_params(completion_params)
|
|
207
207
|
completion_params = parse_ep_completion_params_overwrite(completion_params)
|
|
208
|
-
completion_params = [normalize_fireworks_model(cp) for cp in completion_params]
|
|
209
208
|
original_completion_params = completion_params
|
|
210
209
|
passed_threshold = parse_ep_passed_threshold(passed_threshold)
|
|
211
210
|
data_loaders = parse_ep_dataloaders(data_loaders)
|
|
@@ -366,7 +365,6 @@ def evaluation_test(
|
|
|
366
365
|
row.input_metadata.row_id = generate_id(seed=0, index=index)
|
|
367
366
|
|
|
368
367
|
completion_params = kwargs["completion_params"] if "completion_params" in kwargs else None
|
|
369
|
-
completion_params = normalize_fireworks_model(completion_params)
|
|
370
368
|
# Create eval metadata with test function info and current commit hash
|
|
371
369
|
eval_metadata = EvalMetadata(
|
|
372
370
|
name=test_func.__name__,
|
|
@@ -411,22 +409,21 @@ def evaluation_test(
|
|
|
411
409
|
|
|
412
410
|
rollout_processor.setup()
|
|
413
411
|
|
|
414
|
-
use_priority_scheduler =
|
|
415
|
-
|
|
416
|
-
|
|
412
|
+
use_priority_scheduler = (
|
|
413
|
+
(
|
|
414
|
+
os.environ.get("EP_USE_PRIORITY_SCHEDULER", "0") == "1"
|
|
415
|
+
and not isinstance(rollout_processor, MCPGymRolloutProcessor)
|
|
416
|
+
)
|
|
417
|
+
)
|
|
417
418
|
|
|
418
419
|
if use_priority_scheduler:
|
|
419
420
|
microbatch_output_size = os.environ.get("EP_MICRO_BATCH_OUTPUT_SIZE", None)
|
|
420
421
|
output_dir = os.environ.get("EP_OUTPUT_DIR", None)
|
|
421
422
|
if microbatch_output_size and output_dir:
|
|
422
|
-
output_buffer = MicroBatchDataBuffer(
|
|
423
|
-
num_runs=num_runs,
|
|
424
|
-
batch_size=int(microbatch_output_size),
|
|
425
|
-
output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"),
|
|
426
|
-
)
|
|
423
|
+
output_buffer = MicroBatchDataBuffer(num_runs=num_runs, batch_size=int(microbatch_output_size), output_path_template=os.path.join(output_dir, "buffer_{index}.jsonl"))
|
|
427
424
|
else:
|
|
428
425
|
output_buffer = None
|
|
429
|
-
|
|
426
|
+
|
|
430
427
|
try:
|
|
431
428
|
priority_results = await execute_priority_rollouts(
|
|
432
429
|
dataset=data,
|
|
@@ -444,12 +441,12 @@ def evaluation_test(
|
|
|
444
441
|
finally:
|
|
445
442
|
if output_buffer:
|
|
446
443
|
await output_buffer.close()
|
|
447
|
-
|
|
444
|
+
|
|
448
445
|
for res in priority_results:
|
|
449
446
|
run_idx = (res.execution_metadata.extra or {}).get("run_index", 0)
|
|
450
447
|
if run_idx < len(all_results):
|
|
451
448
|
all_results[run_idx].append(res)
|
|
452
|
-
|
|
449
|
+
|
|
453
450
|
processed_rows_in_run.append(res)
|
|
454
451
|
|
|
455
452
|
postprocess(
|
|
@@ -465,7 +462,6 @@ def evaluation_test(
|
|
|
465
462
|
)
|
|
466
463
|
|
|
467
464
|
else:
|
|
468
|
-
|
|
469
465
|
async def execute_run(run_idx: int, config: RolloutProcessorConfig):
|
|
470
466
|
nonlocal all_results
|
|
471
467
|
|
|
@@ -510,7 +506,9 @@ def evaluation_test(
|
|
|
510
506
|
raise ValueError(
|
|
511
507
|
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
|
|
512
508
|
)
|
|
513
|
-
result.execution_metadata.eval_duration_seconds = time.perf_counter() - start_time
|
|
509
|
+
result.execution_metadata.eval_duration_seconds = (
|
|
510
|
+
time.perf_counter() - start_time
|
|
511
|
+
)
|
|
514
512
|
return result
|
|
515
513
|
|
|
516
514
|
async def _execute_groupwise_eval_with_semaphore(
|
|
@@ -521,9 +519,7 @@ def evaluation_test(
|
|
|
521
519
|
evaluation_test_kwargs = kwargs.get("evaluation_test_kwargs") or {}
|
|
522
520
|
primary_rollout_id = rows[0].execution_metadata.rollout_id if rows else None
|
|
523
521
|
group_rollout_ids = [
|
|
524
|
-
r.execution_metadata.rollout_id
|
|
525
|
-
for r in rows
|
|
526
|
-
if r.execution_metadata.rollout_id
|
|
522
|
+
r.execution_metadata.rollout_id for r in rows if r.execution_metadata.rollout_id
|
|
527
523
|
]
|
|
528
524
|
async with rollout_logging_context(
|
|
529
525
|
primary_rollout_id or "",
|
|
@@ -600,9 +596,7 @@ def evaluation_test(
|
|
|
600
596
|
row_groups[row.input_metadata.row_id].append(row)
|
|
601
597
|
tasks = []
|
|
602
598
|
for _, rows in row_groups.items():
|
|
603
|
-
tasks.append(
|
|
604
|
-
asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows))
|
|
605
|
-
)
|
|
599
|
+
tasks.append(asyncio.create_task(_execute_groupwise_eval_with_semaphore(rows=rows)))
|
|
606
600
|
results = []
|
|
607
601
|
for task in tasks:
|
|
608
602
|
res = await task
|
|
@@ -698,9 +692,9 @@ def evaluation_test(
|
|
|
698
692
|
# For other processors, create all tasks at once and run in parallel
|
|
699
693
|
# Concurrency is now controlled by the shared semaphore in each rollout processor
|
|
700
694
|
await run_tasks_with_run_progress(execute_run, num_runs, config)
|
|
701
|
-
|
|
695
|
+
|
|
702
696
|
experiment_duration_seconds = time.perf_counter() - experiment_start_time
|
|
703
|
-
|
|
697
|
+
|
|
704
698
|
# for groupwise mode, the result contains eval output from multiple completion_params, we need to differentiate them
|
|
705
699
|
# rollout_id is used to differentiate the result from different completion_params
|
|
706
700
|
if mode == "groupwise":
|
|
@@ -736,12 +730,15 @@ def evaluation_test(
|
|
|
736
730
|
experiment_duration_seconds,
|
|
737
731
|
)
|
|
738
732
|
|
|
733
|
+
|
|
734
|
+
|
|
739
735
|
if not all(r.evaluation_result is not None for run_results in all_results for r in run_results):
|
|
740
736
|
raise AssertionError(
|
|
741
737
|
"Some EvaluationRow instances are missing evaluation_result. "
|
|
742
738
|
"Your @evaluation_test function must set `row.evaluation_result`"
|
|
743
739
|
)
|
|
744
740
|
|
|
741
|
+
|
|
745
742
|
except AssertionError:
|
|
746
743
|
_log_eval_error(
|
|
747
744
|
Status.eval_finished(),
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/pytest/evaluation_test_utils.py
RENAMED
|
@@ -619,22 +619,3 @@ def build_rollout_processor_config(
|
|
|
619
619
|
server_script_path=None,
|
|
620
620
|
kwargs=rollout_processor_kwargs,
|
|
621
621
|
)
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
def normalize_fireworks_model(completion_params: CompletionParams | None) -> CompletionParams | None:
|
|
625
|
-
"""Fireworks model names like 'accounts/<org>/models/<model>' need the fireworks_ai/
|
|
626
|
-
prefix when routing through LiteLLM. This function adds the prefix if missing.
|
|
627
|
-
"""
|
|
628
|
-
if completion_params is None:
|
|
629
|
-
return None
|
|
630
|
-
|
|
631
|
-
model = completion_params.get("model")
|
|
632
|
-
if (
|
|
633
|
-
model
|
|
634
|
-
and isinstance(model, str)
|
|
635
|
-
and not model.startswith("fireworks_ai/")
|
|
636
|
-
and re.match(r"^accounts/[^/]+/models/.+", model)
|
|
637
|
-
):
|
|
638
|
-
completion_params = completion_params.copy()
|
|
639
|
-
completion_params["model"] = f"fireworks_ai/{model}"
|
|
640
|
-
return completion_params
|
|
@@ -122,9 +122,20 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
122
122
|
status_logs.append(log)
|
|
123
123
|
|
|
124
124
|
if status_logs:
|
|
125
|
+
if len(status_logs) > 1:
|
|
126
|
+
logger.warning(
|
|
127
|
+
"Found %s status logs for rollout %s; expected at most 1. Using the first one: %s",
|
|
128
|
+
len(status_logs),
|
|
129
|
+
row.execution_metadata.rollout_id,
|
|
130
|
+
status_logs[0],
|
|
131
|
+
)
|
|
125
132
|
# Use the first log with status information
|
|
126
133
|
status_log = status_logs[0]
|
|
127
134
|
status_dict = status_log.get("status")
|
|
135
|
+
raw_extras = status_log.get("extras") or {}
|
|
136
|
+
status_extras = {
|
|
137
|
+
k: v for k, v in raw_extras.items() if k not in ("logger_name", "level", "timestamp")
|
|
138
|
+
}
|
|
128
139
|
|
|
129
140
|
logger.info(
|
|
130
141
|
f"Found status log for rollout {row.execution_metadata.rollout_id}: {status_log.get('message', '')}"
|
|
@@ -149,6 +160,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
149
160
|
details=status_details,
|
|
150
161
|
)
|
|
151
162
|
|
|
163
|
+
if row.execution_metadata.extra:
|
|
164
|
+
row.execution_metadata.extra.update(status_extras)
|
|
165
|
+
else:
|
|
166
|
+
row.execution_metadata.extra = status_extras
|
|
167
|
+
|
|
152
168
|
logger.info("Stopping polling for rollout %s", row.execution_metadata.rollout_id)
|
|
153
169
|
break
|
|
154
170
|
|
|
@@ -179,7 +179,18 @@ def update_row_with_remote_trace(
|
|
|
179
179
|
if k not in row.input_metadata.dataset_info:
|
|
180
180
|
row.input_metadata.dataset_info[k] = v
|
|
181
181
|
|
|
182
|
-
|
|
182
|
+
preserved_extra = row.execution_metadata.extra
|
|
183
|
+
|
|
184
|
+
row.execution_metadata = remote_row.execution_metadata.model_copy(deep=True)
|
|
185
|
+
|
|
186
|
+
if preserved_extra:
|
|
187
|
+
if row.execution_metadata.extra:
|
|
188
|
+
# Merge remote and local extras; local takes precedence on conflicts
|
|
189
|
+
merged = row.execution_metadata.extra or {}
|
|
190
|
+
merged.update(preserved_extra)
|
|
191
|
+
row.execution_metadata.extra = merged
|
|
192
|
+
else:
|
|
193
|
+
row.execution_metadata.extra = preserved_extra
|
|
183
194
|
return None
|
|
184
195
|
else:
|
|
185
196
|
raise ValueError("Output data loader should return exactly one row.")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.3.9.dev1
|
|
3
|
+
Version: 0.3.9.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/normalize_sandbox_fusion.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/development/utils/subprocess_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/adapters/openai_responses.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/agent/resources/sql_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_aime25.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/benchmarks/test_frozen_lake.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/create_rft.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/export_docs.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/local_test.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/cli_commands/run_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/data_loader/jsonl_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/dataset_logger/dataset_logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/event_bus/sqlite_event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/generation/clients/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/deepeval.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/openai_rft.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/openeval.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/integrations/tinker_cookbook.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/rollout_context.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/log_utils/rollout_id_filter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/client/connection.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/base_policy.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/manager.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/execution/vllm_policy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/simple_process_manager.py
RENAMED
|
File without changes
|
{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.9.dev2}/eval_protocol/mcp/simulation_server.py
RENAMED
|
File without changes
|
|
File without changes
|