eval-protocol 0.2.84.dev10__tar.gz → 0.2.86__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.84.dev10/eval_protocol.egg-info → eval_protocol-0.2.86}/PKG-INFO +2 -4
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/README.md +1 -3
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli.py +11 -1
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/local_test.py +25 -12
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86/eval_protocol.egg-info}/PKG-INFO +2 -4
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cli_local_test.py +115 -4
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/LICENSE +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/development/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/pyproject.toml +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/setup.cfg +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/setup.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cli_create_rft_infer.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_format.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_length.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_math.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_models.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/versioneer.py +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84.dev10
|
|
3
|
+
Version: 0.2.86
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -118,9 +118,7 @@ Dynamic: license-file
|
|
|
118
118
|
[](https://pypi.org/project/eval-protocol/)
|
|
119
119
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
120
120
|
|
|
121
|
-
**
|
|
122
|
-
|
|
123
|
-
With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
|
|
121
|
+
**The open-source framework to help you write evals for RL.**
|
|
124
122
|
|
|
125
123
|
## 🚀 Features
|
|
126
124
|
|
|
@@ -3,9 +3,7 @@
|
|
|
3
3
|
[](https://pypi.org/project/eval-protocol/)
|
|
4
4
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
5
5
|
|
|
6
|
-
**
|
|
7
|
-
|
|
8
|
-
With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
|
|
6
|
+
**The open-source framework to help you write evals for RL.**
|
|
9
7
|
|
|
10
8
|
## 🚀 Features
|
|
11
9
|
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-11T17:37:10-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "3b2340644bd8894822a9e34445bc60552e3843cf",
|
|
15
|
+
"version": "0.2.86"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -407,7 +407,7 @@ def parse_args(args=None):
|
|
|
407
407
|
rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
|
|
408
408
|
rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
|
|
409
409
|
# Rollout chunking
|
|
410
|
-
rft_parser.add_argument("--chunk-size", type=int, default=
|
|
410
|
+
rft_parser.add_argument("--chunk-size", type=int, default=100, help="Data chunk size for rollout batching")
|
|
411
411
|
# Inference params
|
|
412
412
|
rft_parser.add_argument("--temperature", type=float)
|
|
413
413
|
rft_parser.add_argument("--top-p", type=float)
|
|
@@ -447,6 +447,16 @@ def parse_args(args=None):
|
|
|
447
447
|
action="store_true",
|
|
448
448
|
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
|
|
449
449
|
)
|
|
450
|
+
local_test_parser.add_argument(
|
|
451
|
+
"--docker-build-extra",
|
|
452
|
+
default="",
|
|
453
|
+
help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
|
|
454
|
+
)
|
|
455
|
+
local_test_parser.add_argument(
|
|
456
|
+
"--docker-run-extra",
|
|
457
|
+
default="",
|
|
458
|
+
help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
|
|
459
|
+
)
|
|
450
460
|
|
|
451
461
|
# Run command (for Hydra-based evaluations)
|
|
452
462
|
# This subparser intentionally defines no arguments itself.
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/local_test.py
RENAMED
|
@@ -2,6 +2,7 @@ import argparse
|
|
|
2
2
|
import os
|
|
3
3
|
import subprocess
|
|
4
4
|
import sys
|
|
5
|
+
import shlex
|
|
5
6
|
from typing import List
|
|
6
7
|
|
|
7
8
|
from .upload import _discover_tests, _prompt_select
|
|
@@ -24,16 +25,15 @@ def _run_pytest_host(pytest_target: str) -> int:
|
|
|
24
25
|
return proc.returncode
|
|
25
26
|
|
|
26
27
|
|
|
27
|
-
def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
28
|
+
def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
|
|
28
29
|
context_dir = os.path.dirname(dockerfile_path)
|
|
29
30
|
print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
|
|
30
31
|
try:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
)
|
|
32
|
+
base_cmd = ["docker", "build"]
|
|
33
|
+
if build_extras:
|
|
34
|
+
base_cmd += build_extras
|
|
35
|
+
base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
|
|
36
|
+
proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
|
37
37
|
print(proc.stdout)
|
|
38
38
|
return proc.returncode == 0
|
|
39
39
|
except FileNotFoundError:
|
|
@@ -41,7 +41,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
|
41
41
|
return False
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
def _run_pytest_in_docker(
|
|
44
|
+
def _run_pytest_in_docker(
|
|
45
|
+
project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
|
|
46
|
+
) -> int:
|
|
45
47
|
workdir = "/workspace"
|
|
46
48
|
# Host HOME logs directory to map into container
|
|
47
49
|
host_home = os.path.expanduser("~")
|
|
@@ -73,6 +75,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
|
|
|
73
75
|
cmd += ["--user", f"{uid}:{gid}"]
|
|
74
76
|
except Exception:
|
|
75
77
|
pass
|
|
78
|
+
if run_extras:
|
|
79
|
+
cmd += run_extras
|
|
76
80
|
cmd += [image_tag, "pytest", pytest_target, "-vs"]
|
|
77
81
|
print("Running in Docker:", " ".join(cmd))
|
|
78
82
|
try:
|
|
@@ -91,11 +95,16 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
91
95
|
entry = getattr(args, "entry", None)
|
|
92
96
|
if entry:
|
|
93
97
|
if "::" in entry:
|
|
94
|
-
file_part = entry.split("::", 1)[0]
|
|
98
|
+
file_part, func_part = entry.split("::", 1)
|
|
95
99
|
file_path = (
|
|
96
100
|
file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
|
|
97
101
|
)
|
|
98
|
-
|
|
102
|
+
# Convert to project-relative like the non-:: path
|
|
103
|
+
try:
|
|
104
|
+
rel = os.path.relpath(file_path, project_root)
|
|
105
|
+
except Exception:
|
|
106
|
+
rel = file_path
|
|
107
|
+
pytest_target = f"{rel}::{func_part}"
|
|
99
108
|
else:
|
|
100
109
|
file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
|
|
101
110
|
# Use path relative to project_root when possible
|
|
@@ -126,6 +135,10 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
126
135
|
pytest_target = rel
|
|
127
136
|
|
|
128
137
|
ignore_docker = bool(getattr(args, "ignore_docker", False))
|
|
138
|
+
build_extras_str = getattr(args, "docker_build_extra", "") or ""
|
|
139
|
+
run_extras_str = getattr(args, "docker_run_extra", "") or ""
|
|
140
|
+
build_extras = shlex.split(build_extras_str) if build_extras_str else []
|
|
141
|
+
run_extras = shlex.split(run_extras_str) if run_extras_str else []
|
|
129
142
|
if ignore_docker:
|
|
130
143
|
if not pytest_target:
|
|
131
144
|
print("Error: Failed to resolve a pytest target to run.")
|
|
@@ -146,14 +159,14 @@ def local_test_command(args: argparse.Namespace) -> int:
|
|
|
146
159
|
except Exception:
|
|
147
160
|
pass
|
|
148
161
|
image_tag = "ep-evaluator:local"
|
|
149
|
-
ok = _build_docker_image(dockerfiles[0], image_tag)
|
|
162
|
+
ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras)
|
|
150
163
|
if not ok:
|
|
151
164
|
print("Docker build failed. See logs above.")
|
|
152
165
|
return 1
|
|
153
166
|
if not pytest_target:
|
|
154
167
|
print("Error: Failed to resolve a pytest target to run.")
|
|
155
168
|
return 1
|
|
156
|
-
return _run_pytest_in_docker(project_root, image_tag, pytest_target)
|
|
169
|
+
return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)
|
|
157
170
|
|
|
158
171
|
# No Dockerfile: run on host
|
|
159
172
|
if not pytest_target:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84.dev10
|
|
3
|
+
Version: 0.2.86
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -118,9 +118,7 @@ Dynamic: license-file
|
|
|
118
118
|
[](https://pypi.org/project/eval-protocol/)
|
|
119
119
|
[](https://deepwiki.com/eval-protocol/python-sdk)
|
|
120
120
|
|
|
121
|
-
**
|
|
122
|
-
|
|
123
|
-
With hundreds of models and configs, you need objective data to choose the right one for your use case. EP helps you evaluate real traces, compare models, and visualize results locally.
|
|
121
|
+
**The open-source framework to help you write evals for RL.**
|
|
124
122
|
|
|
125
123
|
## 🚀 Features
|
|
126
124
|
|
|
@@ -95,11 +95,11 @@ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
|
|
|
95
95
|
from eval_protocol.cli_commands import local_test as lt
|
|
96
96
|
|
|
97
97
|
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
98
|
-
monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag,
|
|
98
|
+
monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)
|
|
99
99
|
|
|
100
100
|
captured = {"target": "", "image": ""}
|
|
101
101
|
|
|
102
|
-
def _fake_run_docker(root: str, image_tag: str, pytest_target: str,
|
|
102
|
+
def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
|
|
103
103
|
captured["target"] = pytest_target
|
|
104
104
|
captured["image"] = image_tag
|
|
105
105
|
return 0
|
|
@@ -123,12 +123,11 @@ def test_local_test_selector_single_test(tmp_path, monkeypatch):
|
|
|
123
123
|
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
124
124
|
|
|
125
125
|
from eval_protocol.cli_commands import local_test as lt
|
|
126
|
-
from eval_protocol.cli_commands import upload as up
|
|
127
126
|
|
|
128
127
|
# No entry; force discover + selector
|
|
129
128
|
disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
|
|
130
129
|
monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
|
|
131
|
-
monkeypatch.setattr(
|
|
130
|
+
monkeypatch.setattr(lt, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
|
|
132
131
|
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
133
132
|
|
|
134
133
|
called = {"host": False}
|
|
@@ -143,3 +142,115 @@ def test_local_test_selector_single_test(tmp_path, monkeypatch):
|
|
|
143
142
|
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
144
143
|
assert rc == 0
|
|
145
144
|
assert called["host"] is True
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_local_test_passes_docker_build_extra(tmp_path, monkeypatch):
|
|
148
|
+
project = tmp_path / "proj"
|
|
149
|
+
project.mkdir()
|
|
150
|
+
monkeypatch.chdir(project)
|
|
151
|
+
|
|
152
|
+
test_file = project / "metric" / "test_build_extra.py"
|
|
153
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
154
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
155
|
+
|
|
156
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
157
|
+
|
|
158
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
159
|
+
|
|
160
|
+
captured = {"extras": None}
|
|
161
|
+
|
|
162
|
+
def _fake_build(dockerfile, tag, build_extras=None):
|
|
163
|
+
captured["extras"] = build_extras
|
|
164
|
+
return True
|
|
165
|
+
|
|
166
|
+
def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
|
|
167
|
+
return 0
|
|
168
|
+
|
|
169
|
+
monkeypatch.setattr(lt, "_build_docker_image", _fake_build)
|
|
170
|
+
monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
|
|
171
|
+
|
|
172
|
+
# Extras string with multiple flags and equals-arg
|
|
173
|
+
args = SimpleNamespace(
|
|
174
|
+
entry=str(test_file),
|
|
175
|
+
ignore_docker=False,
|
|
176
|
+
yes=True,
|
|
177
|
+
docker_build_extra="--no-cache --pull --progress=plain --build-arg KEY=VAL",
|
|
178
|
+
docker_run_extra="",
|
|
179
|
+
)
|
|
180
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
181
|
+
assert rc == 0
|
|
182
|
+
# Expect split list preserving tokens order
|
|
183
|
+
assert captured["extras"] == ["--no-cache", "--pull", "--progress=plain", "--build-arg", "KEY=VAL"]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_local_test_passes_docker_run_extra(tmp_path, monkeypatch):
|
|
187
|
+
project = tmp_path / "proj"
|
|
188
|
+
project.mkdir()
|
|
189
|
+
monkeypatch.chdir(project)
|
|
190
|
+
|
|
191
|
+
test_file = project / "metric" / "test_run_extra.py"
|
|
192
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
193
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
194
|
+
|
|
195
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
196
|
+
|
|
197
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
198
|
+
monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)
|
|
199
|
+
|
|
200
|
+
captured = {"extras": None}
|
|
201
|
+
|
|
202
|
+
def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
|
|
203
|
+
captured["extras"] = run_extras
|
|
204
|
+
return 0
|
|
205
|
+
|
|
206
|
+
monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
|
|
207
|
+
|
|
208
|
+
args = SimpleNamespace(
|
|
209
|
+
entry=str(test_file),
|
|
210
|
+
ignore_docker=False,
|
|
211
|
+
yes=True,
|
|
212
|
+
docker_build_extra="",
|
|
213
|
+
docker_run_extra="--env-file .env --memory=8g --cpus=2 --add-host=host.docker.internal:host-gateway",
|
|
214
|
+
)
|
|
215
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
216
|
+
assert rc == 0
|
|
217
|
+
assert captured["extras"] == [
|
|
218
|
+
"--env-file",
|
|
219
|
+
".env",
|
|
220
|
+
"--memory=8g",
|
|
221
|
+
"--cpus=2",
|
|
222
|
+
"--add-host=host.docker.internal:host-gateway",
|
|
223
|
+
]
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def test_local_test_normalizes_entry_with_selector(tmp_path, monkeypatch):
|
|
227
|
+
project = tmp_path / "proj"
|
|
228
|
+
project.mkdir()
|
|
229
|
+
monkeypatch.chdir(project)
|
|
230
|
+
|
|
231
|
+
# Create a dummy test file
|
|
232
|
+
test_file = project / "metric" / "test_sel_abs.py"
|
|
233
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
234
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
235
|
+
|
|
236
|
+
abs_entry = f"{str(test_file)}::test_dummy"
|
|
237
|
+
|
|
238
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
239
|
+
|
|
240
|
+
# Avoid Docker path
|
|
241
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
242
|
+
|
|
243
|
+
captured = {"target": ""}
|
|
244
|
+
|
|
245
|
+
def _fake_host(target: str) -> int:
|
|
246
|
+
captured["target"] = target
|
|
247
|
+
return 0
|
|
248
|
+
|
|
249
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
250
|
+
|
|
251
|
+
args = SimpleNamespace(entry=abs_entry, ignore_docker=False, yes=True)
|
|
252
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
253
|
+
assert rc == 0
|
|
254
|
+
# Expect project-relative path plus selector
|
|
255
|
+
rel = os.path.relpath(str(test_file), str(project))
|
|
256
|
+
assert captured["target"] == f"{rel}::test_dummy"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/adapters/openai_responses.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/docker_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/agent/resources/sql_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/benchmarks/test_frozen_lake.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/create_rft.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/deploy_mcp.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/cli_commands/run_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/dynamic_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/factory_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/inline_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/data_loader/jsonl_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/dataset_logger/dataset_logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/event_bus/sqlite_event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/generation/clients/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/elasticsearch_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/rollout_context.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev10 → eval_protocol-0.2.86}/eval_protocol/log_utils/rollout_id_filter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|