eval-protocol 0.2.96__tar.gz → 0.2.97__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.96/eval_protocol.egg-info → eval_protocol-0.2.97}/PKG-INFO +1 -1
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_glm_streaming_compliance.py +61 -135
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/create_rft.py +8 -2
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/local_test.py +25 -2
- eval_protocol-0.2.97/eval_protocol/pytest/buffer.py +82 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test.py +284 -226
- eval_protocol-0.2.97/eval_protocol/pytest/priority_scheduler.py +348 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/validate_signature.py +0 -2
- {eval_protocol-0.2.96 → eval_protocol-0.2.97/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/SOURCES.txt +3 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_postprocess.py +60 -1
- eval_protocol-0.2.97/tests/test_priority_scheduler.py +322 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/LICENSE +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/README.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/cli_commands/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/openai_rft.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/tinker_cookbook.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/tinker_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/execution/vllm_policy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/integrations/openenv_trl_vllm.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/openenv_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/rollout_result_post_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/pyproject.toml +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/setup.cfg +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/setup.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_create_rft.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cli_local_test.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_exception_config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_format.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_length.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_litellm_policy_provider_fields.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_math.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_models.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openai_rft_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/versioneer.py +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-CuQbfdPD.js +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-CuQbfdPD.js.map +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/index-iZp_HgyW.css +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.96 → eval_protocol-0.2.97}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.96
|
|
3
|
+
Version: 0.2.97
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-12-
|
|
11
|
+
"date": "2025-12-09T23:27:24-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.96"
|
|
14
|
+
"full-revisionid": "a8914717e39825c126682e1686e036c0e7aa8960",
|
|
15
|
+
"version": "0.2.97"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -10,7 +10,6 @@ from eval_protocol.models import (
|
|
|
10
10
|
EvaluationRow,
|
|
11
11
|
Message,
|
|
12
12
|
MetricResult,
|
|
13
|
-
ChatCompletionContentPartTextParam,
|
|
14
13
|
)
|
|
15
14
|
from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
16
15
|
SingleTurnRolloutProcessor,
|
|
@@ -18,12 +17,12 @@ from eval_protocol.pytest.default_single_turn_rollout_process import (
|
|
|
18
17
|
from eval_protocol.pytest.evaluation_test import evaluation_test
|
|
19
18
|
|
|
20
19
|
|
|
21
|
-
DEFAULT_MODEL_ID = "fireworks_ai/accounts/
|
|
20
|
+
DEFAULT_MODEL_ID = "fireworks_ai/accounts/pyroworks/deployedModels/minimax-m2-zmi4qk9f"
|
|
22
21
|
DEFAULT_MAX_TOKENS = 10000
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
def _coerce_content_to_str(
|
|
26
|
-
content: str | list[
|
|
25
|
+
content: str | list[Any] | None,
|
|
27
26
|
) -> str:
|
|
28
27
|
if isinstance(content, list):
|
|
29
28
|
texts: list[str] = []
|
|
@@ -153,7 +152,34 @@ PEER_TOOL_BRACE_PAYLOAD = {
|
|
|
153
152
|
"content": "Call test_brace_bug with param1='test_value', param2=42, and param3=true",
|
|
154
153
|
}
|
|
155
154
|
],
|
|
156
|
-
"tools":
|
|
155
|
+
"tools": [
|
|
156
|
+
{
|
|
157
|
+
"type": "function",
|
|
158
|
+
"function": {
|
|
159
|
+
"name": "test_brace_bug",
|
|
160
|
+
"description": "A test function to validate JSON brace handling in tool arguments",
|
|
161
|
+
"parameters": {
|
|
162
|
+
"type": "object",
|
|
163
|
+
"properties": {
|
|
164
|
+
"param1": {
|
|
165
|
+
"type": "string",
|
|
166
|
+
"description": "A string parameter",
|
|
167
|
+
},
|
|
168
|
+
"param2": {
|
|
169
|
+
"type": "integer",
|
|
170
|
+
"description": "An integer parameter",
|
|
171
|
+
},
|
|
172
|
+
"param3": {
|
|
173
|
+
"type": "boolean",
|
|
174
|
+
"description": "A boolean parameter",
|
|
175
|
+
},
|
|
176
|
+
},
|
|
177
|
+
"required": ["param1", "param2", "param3"],
|
|
178
|
+
"additionalProperties": False,
|
|
179
|
+
},
|
|
180
|
+
},
|
|
181
|
+
}
|
|
182
|
+
],
|
|
157
183
|
"temperature": 0.1,
|
|
158
184
|
"top_p": 1,
|
|
159
185
|
}
|
|
@@ -468,48 +494,6 @@ PEER_TOOL_PARAMETER_FORMAT_ERRORS_PAYLOAD = {
|
|
|
468
494
|
"stream": True,
|
|
469
495
|
}
|
|
470
496
|
|
|
471
|
-
PEER_TOOL_RECOVERY_FAILURE_PAYLOAD = {
|
|
472
|
-
"messages": [
|
|
473
|
-
{
|
|
474
|
-
"role": "user",
|
|
475
|
-
"content": (
|
|
476
|
-
"View the file at /tmp/test.txt. If that fails, try again with the correct parameters. "
|
|
477
|
-
"Keep retrying until it works."
|
|
478
|
-
),
|
|
479
|
-
}
|
|
480
|
-
],
|
|
481
|
-
"tools": [
|
|
482
|
-
{
|
|
483
|
-
"type": "function",
|
|
484
|
-
"function": {
|
|
485
|
-
"name": "view",
|
|
486
|
-
"description": "View a file or directory",
|
|
487
|
-
"strict": True,
|
|
488
|
-
"parameters": {
|
|
489
|
-
"type": "object",
|
|
490
|
-
"properties": {
|
|
491
|
-
"path": {
|
|
492
|
-
"type": "string",
|
|
493
|
-
"description": "Path to the file or directory to view",
|
|
494
|
-
},
|
|
495
|
-
"type": {
|
|
496
|
-
"type": "string",
|
|
497
|
-
"enum": ["file", "directory"],
|
|
498
|
-
"description": "Type of the path (file or directory)",
|
|
499
|
-
},
|
|
500
|
-
},
|
|
501
|
-
"required": ["path", "type"],
|
|
502
|
-
"additionalProperties": False,
|
|
503
|
-
},
|
|
504
|
-
},
|
|
505
|
-
}
|
|
506
|
-
],
|
|
507
|
-
"tool_choice": "required",
|
|
508
|
-
"temperature": 0.1,
|
|
509
|
-
"max_tokens": 4000,
|
|
510
|
-
"stream": True,
|
|
511
|
-
}
|
|
512
|
-
|
|
513
497
|
|
|
514
498
|
def _build_row_from_payload(case: str, payload: dict[str, Any]) -> EvaluationRow:
|
|
515
499
|
messages = [
|
|
@@ -1329,13 +1313,13 @@ def test_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
|
|
|
1329
1313
|
return row
|
|
1330
1314
|
|
|
1331
1315
|
|
|
1332
|
-
|
|
1333
|
-
"peer-tool-
|
|
1316
|
+
_PEER_TOOL_REQUIRED_PARAMS_ROW = _build_row_from_payload(
|
|
1317
|
+
"peer-tool-required-params", PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
|
|
1334
1318
|
)
|
|
1335
1319
|
|
|
1336
1320
|
|
|
1337
1321
|
@evaluation_test(
|
|
1338
|
-
input_rows=[[_PEER_TOOL_MISSING_REQUIRED_ROW]],
|
|
1322
|
+
input_rows=[[_PEER_TOOL_REQUIRED_PARAMS_ROW]],
|
|
1339
1323
|
completion_params=[_build_completion_params_from_payload(PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD)],
|
|
1340
1324
|
rollout_processor=SingleTurnRolloutProcessor(),
|
|
1341
1325
|
aggregation_method="mean",
|
|
@@ -1343,22 +1327,23 @@ _PEER_TOOL_MISSING_REQUIRED_ROW = _build_row_from_payload(
|
|
|
1343
1327
|
num_runs=1,
|
|
1344
1328
|
mode="pointwise",
|
|
1345
1329
|
)
|
|
1346
|
-
def test_streaming_tool_missing_required_param(row: EvaluationRow) -> EvaluationRow:
|
|
1347
|
-
"""
|
|
1330
|
+
def test_streaming_tool_required_params_present(row: EvaluationRow) -> EvaluationRow:
|
|
1331
|
+
"""Verify that tool calls include all required parameters during streaming."""
|
|
1348
1332
|
|
|
1349
1333
|
assistant_msg = row.last_assistant_message()
|
|
1350
1334
|
finish_reason = row.execution_metadata.finish_reason
|
|
1351
|
-
_debug_log_assistant_message("
|
|
1335
|
+
_debug_log_assistant_message("tool_required_params", assistant_msg, finish_reason)
|
|
1352
1336
|
content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
|
|
1353
1337
|
reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
|
|
1354
1338
|
calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
|
|
1355
1339
|
|
|
1356
|
-
|
|
1340
|
+
required_params_present = False
|
|
1357
1341
|
arguments = None
|
|
1358
1342
|
for _, args in calls:
|
|
1359
1343
|
if args:
|
|
1360
1344
|
arguments = args
|
|
1361
|
-
|
|
1345
|
+
# Check that required 'type' param is present and valid
|
|
1346
|
+
required_params_present = "type" in args and args.get("type") in {"file", "directory"}
|
|
1362
1347
|
|
|
1363
1348
|
metrics = {
|
|
1364
1349
|
"tool_call_emitted": MetricResult(
|
|
@@ -1366,10 +1351,12 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
|
|
|
1366
1351
|
is_score_valid=True,
|
|
1367
1352
|
reason="Tool call emitted" if calls else "No tool call emitted",
|
|
1368
1353
|
),
|
|
1369
|
-
"
|
|
1370
|
-
score=1.0 if
|
|
1354
|
+
"required_params_present": MetricResult(
|
|
1355
|
+
score=1.0 if required_params_present else 0.0,
|
|
1371
1356
|
is_score_valid=bool(calls),
|
|
1372
|
-
reason="
|
|
1357
|
+
reason="All required parameters present"
|
|
1358
|
+
if required_params_present
|
|
1359
|
+
else "Required parameter missing or invalid",
|
|
1373
1360
|
data={"arguments": arguments},
|
|
1374
1361
|
),
|
|
1375
1362
|
"finish_reason": MetricResult(
|
|
@@ -1386,15 +1373,19 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
|
|
|
1386
1373
|
)
|
|
1387
1374
|
|
|
1388
1375
|
all_checks_passed = (
|
|
1389
|
-
|
|
1376
|
+
required_params_present
|
|
1377
|
+
and finish_reason_present
|
|
1378
|
+
and no_forbidden_tags
|
|
1379
|
+
and no_xml_tags
|
|
1380
|
+
and no_reasoning_leakage
|
|
1390
1381
|
)
|
|
1391
1382
|
|
|
1392
1383
|
row.evaluation_result = EvaluateResult(
|
|
1393
1384
|
score=1.0 if all_checks_passed else 0.0,
|
|
1394
1385
|
is_score_valid=True,
|
|
1395
|
-
reason="
|
|
1386
|
+
reason="All required parameters included in tool call"
|
|
1396
1387
|
if all_checks_passed
|
|
1397
|
-
else "Required parameters
|
|
1388
|
+
else "Required parameters missing or response invalid",
|
|
1398
1389
|
metrics=metrics,
|
|
1399
1390
|
)
|
|
1400
1391
|
return row
|
|
@@ -1674,71 +1665,6 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow:
|
|
|
1674
1665
|
return row
|
|
1675
1666
|
|
|
1676
1667
|
|
|
1677
|
-
_PEER_TOOL_RECOVERY_ROW = _build_row_from_payload("peer-tool-recovery-failure", PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)
|
|
1678
|
-
|
|
1679
|
-
|
|
1680
|
-
@evaluation_test(
|
|
1681
|
-
input_rows=[[_PEER_TOOL_RECOVERY_ROW]],
|
|
1682
|
-
completion_params=[_build_completion_params_from_payload(PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)],
|
|
1683
|
-
rollout_processor=SingleTurnRolloutProcessor(),
|
|
1684
|
-
aggregation_method="mean",
|
|
1685
|
-
passed_threshold=0.0,
|
|
1686
|
-
num_runs=1,
|
|
1687
|
-
mode="pointwise",
|
|
1688
|
-
)
|
|
1689
|
-
def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow:
|
|
1690
|
-
"""Check whether the assistant retries tool calls when instructed to recover."""
|
|
1691
|
-
|
|
1692
|
-
assistant_msg = row.last_assistant_message()
|
|
1693
|
-
print(f"assistant_msg: {assistant_msg}")
|
|
1694
|
-
finish_reason = row.execution_metadata.finish_reason
|
|
1695
|
-
_debug_log_assistant_message("tool_recovery", assistant_msg, finish_reason)
|
|
1696
|
-
content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
|
|
1697
|
-
calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
|
|
1698
|
-
reasoning = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
|
|
1699
|
-
|
|
1700
|
-
multiple_attempts = len(calls) >= 2
|
|
1701
|
-
metrics = {
|
|
1702
|
-
"tool_call_attempts": MetricResult(
|
|
1703
|
-
score=1.0 if multiple_attempts else 0.0,
|
|
1704
|
-
is_score_valid=True,
|
|
1705
|
-
reason="Multiple tool call attempts" if multiple_attempts else "Single/no tool call attempt",
|
|
1706
|
-
data={"tool_call_count": len(calls)},
|
|
1707
|
-
),
|
|
1708
|
-
"reasoning_present": MetricResult(
|
|
1709
|
-
score=1.0 if reasoning else 0.0,
|
|
1710
|
-
is_score_valid=True,
|
|
1711
|
-
reason="Reasoning present" if reasoning else "No reasoning provided",
|
|
1712
|
-
data={"reasoning": reasoning[:160]},
|
|
1713
|
-
),
|
|
1714
|
-
"finish_reason": MetricResult(
|
|
1715
|
-
score=1.0 if finish_reason in {"tool_calls", "stop"} else 0.0,
|
|
1716
|
-
is_score_valid=True,
|
|
1717
|
-
reason="finish_reason acceptable"
|
|
1718
|
-
if finish_reason in {"tool_calls", "stop"}
|
|
1719
|
-
else f"Unexpected finish_reason: {finish_reason}",
|
|
1720
|
-
),
|
|
1721
|
-
}
|
|
1722
|
-
|
|
1723
|
-
finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
|
|
1724
|
-
metrics, finish_reason, content_str, reasoning
|
|
1725
|
-
)
|
|
1726
|
-
|
|
1727
|
-
all_checks_passed = (
|
|
1728
|
-
multiple_attempts and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
|
|
1729
|
-
)
|
|
1730
|
-
|
|
1731
|
-
row.evaluation_result = EvaluateResult(
|
|
1732
|
-
score=1.0 if all_checks_passed else 0.0,
|
|
1733
|
-
is_score_valid=True,
|
|
1734
|
-
reason="Multiple recovery attempts observed"
|
|
1735
|
-
if all_checks_passed
|
|
1736
|
-
else "Recovery attempts missing or response invalid",
|
|
1737
|
-
metrics=metrics,
|
|
1738
|
-
)
|
|
1739
|
-
return row
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
1668
|
# ============================================================================
|
|
1743
1669
|
# Reasoning Effort Tests
|
|
1744
1670
|
# ============================================================================
|
|
@@ -1759,7 +1685,7 @@ REASONING_DISABLED_ROW.input_metadata.dataset_info = {
|
|
|
1759
1685
|
input_rows=[[REASONING_DISABLED_ROW]],
|
|
1760
1686
|
completion_params=[
|
|
1761
1687
|
{
|
|
1762
|
-
"model":
|
|
1688
|
+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
|
|
1763
1689
|
"reasoning_effort": "none", # Explicitly disable reasoning
|
|
1764
1690
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
1765
1691
|
"temperature": 0.0,
|
|
@@ -1869,7 +1795,7 @@ REASONING_ENABLED_ROW.input_metadata.dataset_info = {
|
|
|
1869
1795
|
input_rows=[[REASONING_ENABLED_ROW]],
|
|
1870
1796
|
completion_params=[
|
|
1871
1797
|
{
|
|
1872
|
-
"model":
|
|
1798
|
+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
|
|
1873
1799
|
"reasoning_effort": "low", # Enable reasoning
|
|
1874
1800
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
1875
1801
|
"temperature": 0.0,
|
|
@@ -2004,7 +1930,7 @@ TOOLS_WITH_REASONING_ROW.input_metadata.dataset_info = {
|
|
|
2004
1930
|
input_rows=[[TOOLS_WITH_REASONING_ROW]],
|
|
2005
1931
|
completion_params=[
|
|
2006
1932
|
{
|
|
2007
|
-
"model":
|
|
1933
|
+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
|
|
2008
1934
|
"reasoning_effort": "low", # Enable reasoning
|
|
2009
1935
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
2010
1936
|
"temperature": 0.0,
|
|
@@ -2727,7 +2653,7 @@ REASONING_DISABLED_NON_STREAM_ROW.input_metadata.dataset_info = {
|
|
|
2727
2653
|
input_rows=[[REASONING_DISABLED_NON_STREAM_ROW]],
|
|
2728
2654
|
completion_params=[
|
|
2729
2655
|
{
|
|
2730
|
-
"model":
|
|
2656
|
+
"model": DEFAULT_MODEL_ID,
|
|
2731
2657
|
"reasoning_effort": "none",
|
|
2732
2658
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
2733
2659
|
"temperature": 0.0,
|
|
@@ -2834,7 +2760,7 @@ REASONING_ENABLED_NON_STREAM_ROW.input_metadata.dataset_info = {
|
|
|
2834
2760
|
input_rows=[[REASONING_ENABLED_NON_STREAM_ROW]],
|
|
2835
2761
|
completion_params=[
|
|
2836
2762
|
{
|
|
2837
|
-
"model":
|
|
2763
|
+
"model": DEFAULT_MODEL_ID,
|
|
2838
2764
|
"reasoning_effort": "low",
|
|
2839
2765
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
2840
2766
|
"temperature": 0.0,
|
|
@@ -2962,7 +2888,7 @@ TOOLS_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
|
|
|
2962
2888
|
input_rows=[[TOOLS_WITH_REASONING_NON_STREAM_ROW]],
|
|
2963
2889
|
completion_params=[
|
|
2964
2890
|
{
|
|
2965
|
-
"model":
|
|
2891
|
+
"model": DEFAULT_MODEL_ID,
|
|
2966
2892
|
"reasoning_effort": "low",
|
|
2967
2893
|
"max_tokens": DEFAULT_MAX_TOKENS,
|
|
2968
2894
|
"temperature": 0.0,
|
|
@@ -3108,7 +3034,7 @@ STRUCTURED_JSON_SCHEMA = {
|
|
|
3108
3034
|
input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_ROW]],
|
|
3109
3035
|
completion_params=[
|
|
3110
3036
|
{
|
|
3111
|
-
"model":
|
|
3037
|
+
"model": DEFAULT_MODEL_ID,
|
|
3112
3038
|
"stream": True,
|
|
3113
3039
|
"reasoning_effort": "low",
|
|
3114
3040
|
"response_format": STRUCTURED_JSON_SCHEMA,
|
|
@@ -3211,7 +3137,7 @@ STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
|
|
|
3211
3137
|
input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW]],
|
|
3212
3138
|
completion_params=[
|
|
3213
3139
|
{
|
|
3214
|
-
"model":
|
|
3140
|
+
"model": DEFAULT_MODEL_ID,
|
|
3215
3141
|
"stream": False,
|
|
3216
3142
|
"reasoning_effort": "low",
|
|
3217
3143
|
"response_format": STRUCTURED_JSON_SCHEMA,
|
|
@@ -3334,7 +3260,7 @@ MULTIPLE_TOOLS_WITH_REASONING_ROW.input_metadata.dataset_info = {
|
|
|
3334
3260
|
input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_ROW]],
|
|
3335
3261
|
completion_params=[
|
|
3336
3262
|
{
|
|
3337
|
-
"model":
|
|
3263
|
+
"model": DEFAULT_MODEL_ID,
|
|
3338
3264
|
"stream": True,
|
|
3339
3265
|
"reasoning_effort": "low",
|
|
3340
3266
|
"temperature": 0.0,
|
|
@@ -3461,7 +3387,7 @@ MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW.input_metadata.dataset_info = {
|
|
|
3461
3387
|
input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW]],
|
|
3462
3388
|
completion_params=[
|
|
3463
3389
|
{
|
|
3464
|
-
"model":
|
|
3390
|
+
"model": DEFAULT_MODEL_ID,
|
|
3465
3391
|
"stream": False,
|
|
3466
3392
|
"reasoning_effort": "low",
|
|
3467
3393
|
"temperature": 0.0,
|
|
@@ -279,7 +279,13 @@ def _validate_evaluator_locally(
|
|
|
279
279
|
docker_build_extra: str,
|
|
280
280
|
docker_run_extra: str,
|
|
281
281
|
) -> bool:
|
|
282
|
-
"""Run pytest locally for the selected evaluation test to validate the evaluator.
|
|
282
|
+
"""Run pytest locally for the selected evaluation test to validate the evaluator.
|
|
283
|
+
|
|
284
|
+
The pytest helpers always enforce a small success threshold (0.01) for
|
|
285
|
+
evaluation_test-based suites so that an evaluation run where all scores are
|
|
286
|
+
0.0 will naturally fail with a non-zero pytest exit code, which we then treat
|
|
287
|
+
as a failed validator.
|
|
288
|
+
"""
|
|
283
289
|
if not selected_test_file or not selected_test_func:
|
|
284
290
|
# No local test associated; skip validation but warn the user.
|
|
285
291
|
print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.")
|
|
@@ -702,7 +708,7 @@ def _create_rft_job(
|
|
|
702
708
|
print(f"Prepared RFT job for evaluator '{evaluator_id}' using dataset '{dataset_id}'")
|
|
703
709
|
if getattr(args, "evaluation_dataset", None):
|
|
704
710
|
body["evaluationDataset"] = args.evaluation_dataset
|
|
705
|
-
|
|
711
|
+
|
|
706
712
|
output_model_arg = getattr(args, "output_model", None)
|
|
707
713
|
if output_model_arg:
|
|
708
714
|
if len(output_model_arg) > 63:
|
|
@@ -38,7 +38,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List
|
|
|
38
38
|
def _run_pytest_host(pytest_target: str) -> int:
|
|
39
39
|
"""Run pytest against a target on the host and return its exit code."""
|
|
40
40
|
print(f"Running locally: pytest {pytest_target} -vs")
|
|
41
|
-
|
|
41
|
+
# Always enforce a small success threshold for evaluation_test-based suites so that runs with all-zero scores fail.
|
|
42
|
+
cmd = [sys.executable, "-m", "pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
|
|
43
|
+
proc = subprocess.run(cmd)
|
|
42
44
|
return proc.returncode
|
|
43
45
|
|
|
44
46
|
|
|
@@ -69,6 +71,22 @@ def _run_pytest_in_docker(
|
|
|
69
71
|
"-w",
|
|
70
72
|
workdir,
|
|
71
73
|
]
|
|
74
|
+
|
|
75
|
+
# If EP_SUMMARY_JSON is set on the host, mirror it into the container so that
|
|
76
|
+
# pytest evaluation tests can write summary artifacts that are visible to the
|
|
77
|
+
# host. We map paths under the host logs directory (~/.eval_protocol) into the
|
|
78
|
+
# mounted container home directory.
|
|
79
|
+
host_summary_path = os.environ.get("EP_SUMMARY_JSON")
|
|
80
|
+
if host_summary_path:
|
|
81
|
+
try:
|
|
82
|
+
rel_path = os.path.relpath(host_summary_path, host_logs_dir)
|
|
83
|
+
# Only forward the variable when the summary path is inside the logs dir.
|
|
84
|
+
if not rel_path.startswith(os.pardir):
|
|
85
|
+
container_summary_path = os.path.join("/container_home/.eval_protocol", rel_path)
|
|
86
|
+
cmd += ["-e", f"EP_SUMMARY_JSON={container_summary_path}"]
|
|
87
|
+
except Exception:
|
|
88
|
+
# Best-effort only; do not fail docker execution if we can't map the path.
|
|
89
|
+
pass
|
|
72
90
|
# Try to match host user to avoid permission problems on mounted volume
|
|
73
91
|
try:
|
|
74
92
|
uid = os.getuid() # type: ignore[attr-defined]
|
|
@@ -78,7 +96,12 @@ def _run_pytest_in_docker(
|
|
|
78
96
|
pass
|
|
79
97
|
if run_extras:
|
|
80
98
|
cmd += run_extras
|
|
81
|
-
|
|
99
|
+
|
|
100
|
+
# Build pytest command, always enforcing the same small success threshold as
|
|
101
|
+
# the host runner so that all-zero score runs fail consistently.
|
|
102
|
+
pytest_cmd: list[str] = ["pytest", "--ep-success-threshold", "0.001", pytest_target, "-vs"]
|
|
103
|
+
|
|
104
|
+
cmd += [image_tag] + pytest_cmd
|
|
82
105
|
print("Running in Docker:", " ".join(cmd))
|
|
83
106
|
try:
|
|
84
107
|
proc = subprocess.run(cmd)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import List, Dict
|
|
5
|
+
|
|
6
|
+
from eval_protocol.models import EvaluationRow
|
|
7
|
+
|
|
8
|
+
class MicroBatchDataBuffer:
|
|
9
|
+
"""
|
|
10
|
+
Buffers evaluation results and writes them to disk in minibatches.
|
|
11
|
+
Waits for all runs of a sample to complete before considering it ready and flush to disk.
|
|
12
|
+
"""
|
|
13
|
+
def __init__(self, num_runs: int, batch_size: int, output_path_template: str):
|
|
14
|
+
self.num_runs = num_runs
|
|
15
|
+
self.batch_size = batch_size
|
|
16
|
+
self.output_path_template = output_path_template
|
|
17
|
+
self.pending_samples: Dict[str, List[EvaluationRow]] = defaultdict(list) # row_id -> list[EvaluationRow]
|
|
18
|
+
self.completed_samples_buffer: List[List[EvaluationRow]] = [] # List[List[EvaluationRow]]
|
|
19
|
+
self.batch_index = 0
|
|
20
|
+
self.lock = asyncio.Lock()
|
|
21
|
+
|
|
22
|
+
async def add_result(self, row: EvaluationRow):
|
|
23
|
+
"""
|
|
24
|
+
Add a single evaluation result.
|
|
25
|
+
Thread-safe/Coroutine-safe.
|
|
26
|
+
"""
|
|
27
|
+
async with self.lock:
|
|
28
|
+
row_id = row.input_metadata.row_id
|
|
29
|
+
if not row_id:
|
|
30
|
+
# Should not happen in valid EP workflow, unique row_id is required to group things together properly
|
|
31
|
+
return
|
|
32
|
+
|
|
33
|
+
self.pending_samples[row_id].append(row)
|
|
34
|
+
|
|
35
|
+
if len(self.pending_samples[row_id]) >= self.num_runs:
|
|
36
|
+
# Sample completed (all runs finished)
|
|
37
|
+
completed_rows = self.pending_samples.pop(row_id)
|
|
38
|
+
self.completed_samples_buffer.append(completed_rows)
|
|
39
|
+
|
|
40
|
+
if len(self.completed_samples_buffer) >= self.batch_size:
|
|
41
|
+
await self._flush_unsafe()
|
|
42
|
+
|
|
43
|
+
async def _flush_unsafe(self):
|
|
44
|
+
"""
|
|
45
|
+
not thread safe, assumes lock is held by called
|
|
46
|
+
"""
|
|
47
|
+
if not self.completed_samples_buffer:
|
|
48
|
+
return
|
|
49
|
+
|
|
50
|
+
if "{index}" in self.output_path_template:
|
|
51
|
+
output_path = self.output_path_template.format(index=self.batch_index)
|
|
52
|
+
mode = "w"
|
|
53
|
+
else:
|
|
54
|
+
output_path = self.output_path_template
|
|
55
|
+
mode = "a" # Append if no index placeholder
|
|
56
|
+
|
|
57
|
+
# Ensure directory exists
|
|
58
|
+
os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
|
|
59
|
+
|
|
60
|
+
# Write flattened rows
|
|
61
|
+
with open(output_path, mode) as f:
|
|
62
|
+
for sample_rows in self.completed_samples_buffer:
|
|
63
|
+
for row in sample_rows:
|
|
64
|
+
f.write(row.model_dump_json() + "\n")
|
|
65
|
+
|
|
66
|
+
self.completed_samples_buffer = []
|
|
67
|
+
self.batch_index += 1
|
|
68
|
+
|
|
69
|
+
async def close(self):
|
|
70
|
+
"""
|
|
71
|
+
Flush any remaining samples in the buffer.
|
|
72
|
+
"""
|
|
73
|
+
async with self.lock:
|
|
74
|
+
# Also flush pending (incomplete) samples to avoid data loss
|
|
75
|
+
if self.pending_samples:
|
|
76
|
+
for rows in self.pending_samples.values():
|
|
77
|
+
self.completed_samples_buffer.append(rows)
|
|
78
|
+
self.pending_samples.clear()
|
|
79
|
+
|
|
80
|
+
if self.completed_samples_buffer:
|
|
81
|
+
await self._flush_unsafe()
|
|
82
|
+
|