eval-protocol 0.2.84.dev1__tar.gz → 0.2.84.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.84.dev1/eval_protocol.egg-info → eval_protocol-0.2.84.dev3}/PKG-INFO +1 -1
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli.py +25 -0
- eval_protocol-0.2.84.dev3/eval_protocol/cli_commands/local_test.py +151 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/upload.py +2 -2
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/SOURCES.txt +2 -0
- eval_protocol-0.2.84.dev3/tests/test_cli_local_test.py +145 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/LICENSE +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/README.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/pyproject.toml +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/setup.cfg +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/setup.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cli_create_rft_infer.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_format.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_length.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_math.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_models.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/versioneer.py +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84.
|
|
3
|
+
Version: 0.2.84.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-10T18:00:39-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.84.dev.
|
|
14
|
+
"full-revisionid": "e7615d7ec75524b19ed38241d1c6165cf32dd79f",
|
|
15
|
+
"version": "0.2.84.dev.3"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -427,6 +427,27 @@ def parse_args(args=None):
|
|
|
427
427
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
|
|
428
428
|
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
429
429
|
|
|
430
|
+
# Local test command
|
|
431
|
+
local_test_parser = subparsers.add_parser(
|
|
432
|
+
"local-test",
|
|
433
|
+
help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
|
|
434
|
+
)
|
|
435
|
+
local_test_parser.add_argument(
|
|
436
|
+
"--entry",
|
|
437
|
+
help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
|
|
438
|
+
)
|
|
439
|
+
local_test_parser.add_argument(
|
|
440
|
+
"--ignore-docker",
|
|
441
|
+
action="store_true",
|
|
442
|
+
help="Ignore Dockerfile even if present; run pytest on host",
|
|
443
|
+
)
|
|
444
|
+
local_test_parser.add_argument(
|
|
445
|
+
"--yes",
|
|
446
|
+
"-y",
|
|
447
|
+
action="store_true",
|
|
448
|
+
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
|
|
449
|
+
)
|
|
450
|
+
|
|
430
451
|
# Run command (for Hydra-based evaluations)
|
|
431
452
|
# This subparser intentionally defines no arguments itself.
|
|
432
453
|
# All arguments after 'run' will be passed to Hydra by parse_known_args.
|
|
@@ -559,6 +580,10 @@ def main():
|
|
|
559
580
|
return create_rft_command(args)
|
|
560
581
|
print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
|
|
561
582
|
return 1
|
|
583
|
+
elif args.command == "local-test":
|
|
584
|
+
from .cli_commands.local_test import local_test_command
|
|
585
|
+
|
|
586
|
+
return local_test_command(args)
|
|
562
587
|
elif args.command == "run":
|
|
563
588
|
# For the 'run' command, Hydra takes over argument parsing.
|
|
564
589
|
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import sys
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from .upload import _discover_tests, _prompt_select
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _find_dockerfiles(root: str) -> List[str]:
|
|
11
|
+
skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
|
|
12
|
+
dockerfiles: List[str] = []
|
|
13
|
+
for dirpath, dirnames, filenames in os.walk(root):
|
|
14
|
+
dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
|
|
15
|
+
for name in filenames:
|
|
16
|
+
if name == "Dockerfile":
|
|
17
|
+
dockerfiles.append(os.path.join(dirpath, name))
|
|
18
|
+
return dockerfiles
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _run_pytest_host(pytest_target: str) -> int:
|
|
22
|
+
print(f"Running locally: pytest {pytest_target} -vs")
|
|
23
|
+
proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
|
|
24
|
+
return proc.returncode
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
28
|
+
context_dir = os.path.dirname(dockerfile_path)
|
|
29
|
+
print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
|
|
30
|
+
try:
|
|
31
|
+
proc = subprocess.run(
|
|
32
|
+
["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
|
|
33
|
+
stdout=subprocess.PIPE,
|
|
34
|
+
stderr=subprocess.STDOUT,
|
|
35
|
+
text=True,
|
|
36
|
+
)
|
|
37
|
+
print(proc.stdout)
|
|
38
|
+
return proc.returncode == 0
|
|
39
|
+
except FileNotFoundError:
|
|
40
|
+
print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
|
|
41
|
+
return False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
|
|
45
|
+
workdir = "/workspace"
|
|
46
|
+
# Mount read-only is safer; but tests may write artifacts. Use read-write.
|
|
47
|
+
cmd = [
|
|
48
|
+
"docker",
|
|
49
|
+
"run",
|
|
50
|
+
"--rm",
|
|
51
|
+
"-v",
|
|
52
|
+
f"{project_root}:{workdir}",
|
|
53
|
+
"-e",
|
|
54
|
+
f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
|
|
55
|
+
"-w",
|
|
56
|
+
workdir,
|
|
57
|
+
]
|
|
58
|
+
# Try to match host user to avoid permission problems on mounted volume
|
|
59
|
+
try:
|
|
60
|
+
uid = os.getuid() # type: ignore[attr-defined]
|
|
61
|
+
gid = os.getgid() # type: ignore[attr-defined]
|
|
62
|
+
cmd += ["--user", f"{uid}:{gid}"]
|
|
63
|
+
except Exception:
|
|
64
|
+
pass
|
|
65
|
+
cmd += [image_tag, "pytest", pytest_target, "-vs"]
|
|
66
|
+
print("Running in Docker:", " ".join(cmd))
|
|
67
|
+
try:
|
|
68
|
+
proc = subprocess.run(cmd)
|
|
69
|
+
return proc.returncode
|
|
70
|
+
except FileNotFoundError:
|
|
71
|
+
print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
|
|
72
|
+
return 1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def local_test_command(args: argparse.Namespace) -> int:
|
|
76
|
+
project_root = os.getcwd()
|
|
77
|
+
|
|
78
|
+
# Selection and pytest target resolution
|
|
79
|
+
pytest_target: str = ""
|
|
80
|
+
entry = getattr(args, "entry", None)
|
|
81
|
+
if entry:
|
|
82
|
+
if "::" in entry:
|
|
83
|
+
file_part = entry.split("::", 1)[0]
|
|
84
|
+
file_path = (
|
|
85
|
+
file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
|
|
86
|
+
)
|
|
87
|
+
pytest_target = entry
|
|
88
|
+
else:
|
|
89
|
+
file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
|
|
90
|
+
# Use path relative to project_root when possible
|
|
91
|
+
try:
|
|
92
|
+
rel = os.path.relpath(file_path, project_root)
|
|
93
|
+
except Exception:
|
|
94
|
+
rel = file_path
|
|
95
|
+
pytest_target = rel
|
|
96
|
+
else:
|
|
97
|
+
tests = _discover_tests(project_root)
|
|
98
|
+
if not tests:
|
|
99
|
+
print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
|
|
100
|
+
return 1
|
|
101
|
+
non_interactive = bool(getattr(args, "yes", False))
|
|
102
|
+
selected = _prompt_select(tests, non_interactive=non_interactive)
|
|
103
|
+
if not selected:
|
|
104
|
+
print("No tests selected.")
|
|
105
|
+
return 1
|
|
106
|
+
if len(selected) != 1:
|
|
107
|
+
print("Error: Please select exactly one evaluation test for 'local-test'.")
|
|
108
|
+
return 1
|
|
109
|
+
chosen = selected[0]
|
|
110
|
+
abs_path = os.path.abspath(chosen.file_path)
|
|
111
|
+
try:
|
|
112
|
+
rel = os.path.relpath(abs_path, project_root)
|
|
113
|
+
except Exception:
|
|
114
|
+
rel = abs_path
|
|
115
|
+
pytest_target = rel
|
|
116
|
+
|
|
117
|
+
ignore_docker = bool(getattr(args, "ignore_docker", False))
|
|
118
|
+
if ignore_docker:
|
|
119
|
+
if not pytest_target:
|
|
120
|
+
print("Error: Failed to resolve a pytest target to run.")
|
|
121
|
+
return 1
|
|
122
|
+
return _run_pytest_host(pytest_target)
|
|
123
|
+
|
|
124
|
+
dockerfiles = _find_dockerfiles(project_root)
|
|
125
|
+
if len(dockerfiles) > 1:
|
|
126
|
+
print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
|
|
127
|
+
for df in dockerfiles:
|
|
128
|
+
print(f" - {df}")
|
|
129
|
+
print("Hint: use --ignore-docker to bypass Docker.")
|
|
130
|
+
return 1
|
|
131
|
+
if len(dockerfiles) == 1:
|
|
132
|
+
# Ensure shared logs directory exists on host so container writes are visible to host ep logs
|
|
133
|
+
try:
|
|
134
|
+
os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
|
|
135
|
+
except Exception:
|
|
136
|
+
pass
|
|
137
|
+
image_tag = "ep-evaluator:local"
|
|
138
|
+
ok = _build_docker_image(dockerfiles[0], image_tag)
|
|
139
|
+
if not ok:
|
|
140
|
+
print("Docker build failed. See logs above.")
|
|
141
|
+
return 1
|
|
142
|
+
if not pytest_target:
|
|
143
|
+
print("Error: Failed to resolve a pytest target to run.")
|
|
144
|
+
return 1
|
|
145
|
+
return _run_pytest_in_docker(project_root, image_tag, pytest_target)
|
|
146
|
+
|
|
147
|
+
# No Dockerfile: run on host
|
|
148
|
+
if not pytest_target:
|
|
149
|
+
print("Error: Failed to resolve a pytest target to run.")
|
|
150
|
+
return 1
|
|
151
|
+
return _run_pytest_host(pytest_target)
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/upload.py
RENAMED
|
@@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
|
|
|
437
437
|
# Check if only one test - auto-select it
|
|
438
438
|
if len(tests) == 1:
|
|
439
439
|
print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
|
|
440
|
-
confirm = questionary.confirm("
|
|
440
|
+
confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask()
|
|
441
441
|
if confirm:
|
|
442
442
|
return tests
|
|
443
443
|
else:
|
|
@@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
|
|
|
500
500
|
|
|
501
501
|
print("=" * 80)
|
|
502
502
|
try:
|
|
503
|
-
choice = input("Enter the number to
|
|
503
|
+
choice = input("Enter the number to select: ").strip()
|
|
504
504
|
except KeyboardInterrupt:
|
|
505
505
|
print("\n\nUpload cancelled.")
|
|
506
506
|
return []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84.
|
|
3
|
+
Version: 0.2.84.dev3
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -91,6 +91,7 @@ eval_protocol/cli_commands/common.py
|
|
|
91
91
|
eval_protocol/cli_commands/create_rft.py
|
|
92
92
|
eval_protocol/cli_commands/deploy.py
|
|
93
93
|
eval_protocol/cli_commands/deploy_mcp.py
|
|
94
|
+
eval_protocol/cli_commands/local_test.py
|
|
94
95
|
eval_protocol/cli_commands/logs.py
|
|
95
96
|
eval_protocol/cli_commands/preview.py
|
|
96
97
|
eval_protocol/cli_commands/run_eval_cmd.py
|
|
@@ -277,6 +278,7 @@ tests/test_cli.py
|
|
|
277
278
|
tests/test_cli_agent.py
|
|
278
279
|
tests/test_cli_args.py
|
|
279
280
|
tests/test_cli_create_rft_infer.py
|
|
281
|
+
tests/test_cli_local_test.py
|
|
280
282
|
tests/test_code_execution.py
|
|
281
283
|
tests/test_config.py
|
|
282
284
|
tests/test_control_plane_separation.py
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from types import SimpleNamespace
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
|
|
8
|
+
project = tmp_path / "proj"
|
|
9
|
+
project.mkdir()
|
|
10
|
+
monkeypatch.chdir(project)
|
|
11
|
+
|
|
12
|
+
# Create a dummy test file
|
|
13
|
+
test_file = project / "metric" / "test_one.py"
|
|
14
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
15
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
16
|
+
|
|
17
|
+
# Import module under test
|
|
18
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
19
|
+
|
|
20
|
+
# Avoid Docker path
|
|
21
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
22
|
+
|
|
23
|
+
captured = {"target": ""}
|
|
24
|
+
|
|
25
|
+
def _fake_host(target: str) -> int:
|
|
26
|
+
captured["target"] = target
|
|
27
|
+
return 0
|
|
28
|
+
|
|
29
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
30
|
+
|
|
31
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
32
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
33
|
+
assert rc == 0
|
|
34
|
+
# Expect relative path target
|
|
35
|
+
assert captured["target"] == os.path.relpath(str(test_file), str(project))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
|
|
39
|
+
project = tmp_path / "proj"
|
|
40
|
+
project.mkdir()
|
|
41
|
+
monkeypatch.chdir(project)
|
|
42
|
+
|
|
43
|
+
test_file = project / "metric" / "test_two.py"
|
|
44
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
45
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
46
|
+
|
|
47
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
48
|
+
|
|
49
|
+
# Pretend we have Dockerfile(s), but ignore_docker=True should skip
|
|
50
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
51
|
+
|
|
52
|
+
called = {"host": False}
|
|
53
|
+
|
|
54
|
+
def _fake_host(target: str) -> int:
|
|
55
|
+
called["host"] = True
|
|
56
|
+
return 0
|
|
57
|
+
|
|
58
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
59
|
+
|
|
60
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True)
|
|
61
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
62
|
+
assert rc == 0
|
|
63
|
+
assert called["host"] is True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
|
|
67
|
+
project = tmp_path / "proj"
|
|
68
|
+
project.mkdir()
|
|
69
|
+
monkeypatch.chdir(project)
|
|
70
|
+
|
|
71
|
+
test_file = project / "metric" / "test_three.py"
|
|
72
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
73
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
74
|
+
|
|
75
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
76
|
+
|
|
77
|
+
monkeypatch.setattr(
|
|
78
|
+
lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
82
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
83
|
+
assert rc == 1
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
|
|
87
|
+
project = tmp_path / "proj"
|
|
88
|
+
project.mkdir()
|
|
89
|
+
monkeypatch.chdir(project)
|
|
90
|
+
|
|
91
|
+
test_file = project / "metric" / "test_four.py"
|
|
92
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
93
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
94
|
+
|
|
95
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
96
|
+
|
|
97
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
98
|
+
monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
|
|
99
|
+
|
|
100
|
+
captured = {"target": "", "image": ""}
|
|
101
|
+
|
|
102
|
+
def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
|
|
103
|
+
captured["target"] = pytest_target
|
|
104
|
+
captured["image"] = image_tag
|
|
105
|
+
return 0
|
|
106
|
+
|
|
107
|
+
monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
|
|
108
|
+
|
|
109
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
110
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
111
|
+
assert rc == 0
|
|
112
|
+
assert captured["image"] == "ep-evaluator:local"
|
|
113
|
+
assert captured["target"] == os.path.relpath(str(test_file), str(project))
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_local_test_selector_single_test(tmp_path, monkeypatch):
|
|
117
|
+
project = tmp_path / "proj"
|
|
118
|
+
project.mkdir()
|
|
119
|
+
monkeypatch.chdir(project)
|
|
120
|
+
|
|
121
|
+
test_file = project / "metric" / "test_sel.py"
|
|
122
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
123
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
124
|
+
|
|
125
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
126
|
+
from eval_protocol.cli_commands import upload as up
|
|
127
|
+
|
|
128
|
+
# No entry; force discover + selector
|
|
129
|
+
disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
|
|
130
|
+
monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
|
|
131
|
+
monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
|
|
132
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
133
|
+
|
|
134
|
+
called = {"host": False}
|
|
135
|
+
|
|
136
|
+
def _fake_host(target: str) -> int:
|
|
137
|
+
called["host"] = True
|
|
138
|
+
return 0
|
|
139
|
+
|
|
140
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
141
|
+
|
|
142
|
+
args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
|
|
143
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
144
|
+
assert rc == 0
|
|
145
|
+
assert called["host"] is True
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/normalize_sandbox_fusion.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/generate_api_key.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/development/utils/subprocess_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/braintrust.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/huggingface.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/adapters/openai_responses.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resource_pool.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/resources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/agent/tool_registry.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_aime25.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_frozen_lake.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/benchmarks/test_gpqa.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/__init__.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/common.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/create_rft.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/deploy_mcp.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/preview.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/cli_commands/run_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/data_loader/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/dataset_logger/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/event_bus/sqlite_event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.84.dev1 → eval_protocol-0.2.84.dev3}/eval_protocol/generation/clients/base.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|