eval-protocol 0.2.84__tar.gz → 0.2.84.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.84/eval_protocol.egg-info → eval_protocol-0.2.84.dev2}/PKG-INFO +1 -1
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/auth.py +68 -3
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli.py +26 -1
- eval_protocol-0.2.84.dev2/eval_protocol/cli_commands/local_test.py +140 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/upload.py +24 -3
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/evaluation.py +3 -1
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/fireworks_rft.py +3 -1
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/handle_persist_flow.py +15 -15
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/SOURCES.txt +2 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_auth.py +10 -5
- eval_protocol-0.2.84.dev2/tests/test_cli_local_test.py +145 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/LICENSE +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/README.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/jsonl_data_loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/exceptions.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/evaluator/test_svgagent.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/evaluator/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/pyproject.toml +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/setup.cfg +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/setup.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cli_create_rft_infer.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_exceptions.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_message_field_filtering.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/versioneer.py +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84
|
|
3
|
+
Version: 0.2.84.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-11-
|
|
11
|
+
"date": "2025-11-10T17:41:27-0800",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.84"
|
|
14
|
+
"full-revisionid": "cd9cc91c34f975482fe05b4bf3a60b4a0bcbd746",
|
|
15
|
+
"version": "0.2.84.dev.2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -136,6 +136,56 @@ def _get_credential_from_config_file(key_name: str) -> Optional[str]:
|
|
|
136
136
|
return None
|
|
137
137
|
|
|
138
138
|
|
|
139
|
+
def _get_credentials_from_config_file() -> Dict[str, Optional[str]]:
    """
    Look up 'api_key' and 'account_id' in auth.ini in one call.

    The file is first handed to the simple key=value parser; whatever is still
    missing afterwards is looked up with configparser, preferring the
    [fireworks] section over the default section.

    Returns:
        A dict holding only the keys that were found with a non-empty value
        (any subset of 'api_key' and 'account_id'). Empty when auth.ini does
        not exist.
    """
    wanted = ("api_key", "account_id")
    results: Dict[str, Optional[str]] = {}

    auth_ini_path = _get_auth_ini_file()
    if not auth_ini_path.exists():
        return results

    # Pass 1: simple key=value parsing. Failures are logged, not raised, so
    # the configparser pass below still gets a chance.
    try:
        parsed = _parse_simple_auth_file(auth_ini_path)
        for name in wanted:
            if name in parsed and parsed[name]:
                results[name] = parsed[name]
        if all(name in results for name in wanted):
            return results
    except Exception as e:
        logger.warning("Error during simple parsing of %s: %s", str(auth_ini_path), e)

    # Pass 2: configparser, only for the keys pass 1 did not provide.
    try:
        config = configparser.ConfigParser()
        config.read(auth_ini_path)
        for key_name in wanted:
            if results.get(key_name):
                continue

            has_fireworks_option = "fireworks" in config and config.has_option("fireworks", key_name)
            if has_fireworks_option:
                section_value = config.get("fireworks", key_name)
                if section_value:
                    results[key_name] = section_value
                    continue

            if config.has_option(config.default_section, key_name):
                default_value = config.get(config.default_section, key_name)
                if default_value:
                    results[key_name] = default_value
    except configparser.MissingSectionHeaderError:
        # Purely key=value file without section headers; simple parsing should have handled it already.
        logger.debug("%s has no section headers; falling back to simple parsing results.", str(auth_ini_path))
    except configparser.Error as e_config:
        logger.warning("Configparser error reading %s: %s", str(auth_ini_path), e_config)
    except Exception as e_general:
        logger.warning("Unexpected error reading %s: %s", str(auth_ini_path), e_general)

    return results
|
|
187
|
+
|
|
188
|
+
|
|
139
189
|
def get_fireworks_api_key() -> Optional[str]:
|
|
140
190
|
"""
|
|
141
191
|
Retrieves the Fireworks API key.
|
|
@@ -177,13 +227,15 @@ def get_fireworks_account_id() -> Optional[str]:
|
|
|
177
227
|
The Account ID is sourced in the following order:
|
|
178
228
|
1. FIREWORKS_ACCOUNT_ID environment variable.
|
|
179
229
|
2. 'account_id' from the [fireworks] section of ~/.fireworks/auth.ini.
|
|
230
|
+
3. If an API key is available (env or auth.ini), resolve via verifyApiKey.
|
|
180
231
|
|
|
181
232
|
Returns:
|
|
182
233
|
The Account ID if found, otherwise None.
|
|
183
234
|
"""
|
|
184
235
|
# If a profile is active, prefer profile file first, then env
|
|
185
236
|
if _is_profile_active():
|
|
186
|
-
|
|
237
|
+
creds = _get_credentials_from_config_file()
|
|
238
|
+
account_id_from_file = creds.get("account_id")
|
|
187
239
|
if account_id_from_file:
|
|
188
240
|
return account_id_from_file
|
|
189
241
|
account_id = os.environ.get("FIREWORKS_ACCOUNT_ID")
|
|
@@ -196,11 +248,24 @@ def get_fireworks_account_id() -> Optional[str]:
|
|
|
196
248
|
if account_id:
|
|
197
249
|
logger.debug("Using FIREWORKS_ACCOUNT_ID from environment variable.")
|
|
198
250
|
return account_id
|
|
199
|
-
|
|
251
|
+
creds = _get_credentials_from_config_file()
|
|
252
|
+
account_id_from_file = creds.get("account_id")
|
|
200
253
|
if account_id_from_file:
|
|
201
254
|
return account_id_from_file
|
|
202
255
|
|
|
203
|
-
|
|
256
|
+
# 3) Fallback: if API key is present, attempt to resolve via verifyApiKey (env or auth.ini)
|
|
257
|
+
try:
|
|
258
|
+
# Intentionally use get_fireworks_api_key to centralize precedence (env vs file)
|
|
259
|
+
api_key_for_verify = get_fireworks_api_key()
|
|
260
|
+
if api_key_for_verify:
|
|
261
|
+
resolved = verify_api_key_and_get_account_id(api_key=api_key_for_verify, api_base=get_fireworks_api_base())
|
|
262
|
+
if resolved:
|
|
263
|
+
logger.debug("Using FIREWORKS_ACCOUNT_ID resolved via verifyApiKey: %s", resolved)
|
|
264
|
+
return resolved
|
|
265
|
+
except Exception as e:
|
|
266
|
+
logger.debug("Failed to resolve FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", e)
|
|
267
|
+
|
|
268
|
+
logger.debug("Fireworks Account ID not found in environment variables, auth.ini, or via verifyApiKey.")
|
|
204
269
|
return None
|
|
205
270
|
|
|
206
271
|
|
|
@@ -395,7 +395,7 @@ def parse_args(args=None):
|
|
|
395
395
|
rft_parser.add_argument("--base-model", help="Base model resource id")
|
|
396
396
|
rft_parser.add_argument("--warm-start-from", help="Addon model to warm start from")
|
|
397
397
|
rft_parser.add_argument("--output-model", help="Output model id (defaults from evaluator)")
|
|
398
|
-
rft_parser.add_argument("--epochs", type=int, default=
|
|
398
|
+
rft_parser.add_argument("--epochs", type=int, default=1)
|
|
399
399
|
rft_parser.add_argument("--batch-size", type=int, default=128000)
|
|
400
400
|
rft_parser.add_argument("--learning-rate", type=float, default=3e-5)
|
|
401
401
|
rft_parser.add_argument("--max-context-length", type=int, default=65536)
|
|
@@ -427,6 +427,27 @@ def parse_args(args=None):
|
|
|
427
427
|
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
|
|
428
428
|
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
|
|
429
429
|
|
|
430
|
+
# Local test command
|
|
431
|
+
local_test_parser = subparsers.add_parser(
|
|
432
|
+
"local-test",
|
|
433
|
+
help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
|
|
434
|
+
)
|
|
435
|
+
local_test_parser.add_argument(
|
|
436
|
+
"--entry",
|
|
437
|
+
help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
|
|
438
|
+
)
|
|
439
|
+
local_test_parser.add_argument(
|
|
440
|
+
"--ignore-docker",
|
|
441
|
+
action="store_true",
|
|
442
|
+
help="Ignore Dockerfile even if present; run pytest on host",
|
|
443
|
+
)
|
|
444
|
+
local_test_parser.add_argument(
|
|
445
|
+
"--yes",
|
|
446
|
+
"-y",
|
|
447
|
+
action="store_true",
|
|
448
|
+
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
|
|
449
|
+
)
|
|
450
|
+
|
|
430
451
|
# Run command (for Hydra-based evaluations)
|
|
431
452
|
# This subparser intentionally defines no arguments itself.
|
|
432
453
|
# All arguments after 'run' will be passed to Hydra by parse_known_args.
|
|
@@ -559,6 +580,10 @@ def main():
|
|
|
559
580
|
return create_rft_command(args)
|
|
560
581
|
print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
|
|
561
582
|
return 1
|
|
583
|
+
elif args.command == "local-test":
|
|
584
|
+
from .cli_commands.local_test import local_test_command
|
|
585
|
+
|
|
586
|
+
return local_test_command(args)
|
|
562
587
|
elif args.command == "run":
|
|
563
588
|
# For the 'run' command, Hydra takes over argument parsing.
|
|
564
589
|
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import subprocess
|
|
4
|
+
import sys
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
from .upload import _discover_tests, _prompt_select
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _find_dockerfiles(root: str) -> List[str]:
|
|
11
|
+
skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
|
|
12
|
+
dockerfiles: List[str] = []
|
|
13
|
+
for dirpath, dirnames, filenames in os.walk(root):
|
|
14
|
+
dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
|
|
15
|
+
for name in filenames:
|
|
16
|
+
if name == "Dockerfile":
|
|
17
|
+
dockerfiles.append(os.path.join(dirpath, name))
|
|
18
|
+
return dockerfiles
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _run_pytest_host(pytest_target: str) -> int:
    """Run ``pytest <pytest_target> -vs`` with the current interpreter and return its exit code."""
    print(f"Running locally: pytest {pytest_target} -vs")
    # `python -m pytest` uses the interpreter that is running this CLI.
    command = [sys.executable, "-m", "pytest", pytest_target, "-vs"]
    return subprocess.run(command).returncode
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
|
|
28
|
+
context_dir = os.path.dirname(dockerfile_path)
|
|
29
|
+
print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
|
|
30
|
+
try:
|
|
31
|
+
proc = subprocess.run(
|
|
32
|
+
["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
|
|
33
|
+
stdout=subprocess.PIPE,
|
|
34
|
+
stderr=subprocess.STDOUT,
|
|
35
|
+
text=True,
|
|
36
|
+
)
|
|
37
|
+
print(proc.stdout)
|
|
38
|
+
return proc.returncode == 0
|
|
39
|
+
except FileNotFoundError:
|
|
40
|
+
print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
|
|
41
|
+
return False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
    """
    Run ``pytest <pytest_target> -vs`` inside a throwaway container of ``image_tag``.

    ``project_root`` is bind-mounted at /workspace, which is also the
    container's working directory.

    Returns:
        The exit code of ``docker run``, or 1 when the ``docker`` executable
        cannot be found.
    """
    container_dir = "/workspace"
    # The mount is read-write on purpose: read-only would be safer, but tests may write artifacts.
    docker_args = ["docker", "run", "--rm", "-v", f"{project_root}:{container_dir}", "-w", container_dir]
    cmd = docker_args + [image_tag, "pytest", pytest_target, "-vs"]
    print("Running in Docker:", " ".join(cmd))
    try:
        return subprocess.run(cmd).returncode
    except FileNotFoundError:
        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
        return 1
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _relative_to_project(path: str, project_root: str) -> str:
    """Return ``path`` relative to ``project_root``, or its absolute form if that is impossible."""
    absolute = path if os.path.isabs(path) else os.path.abspath(os.path.join(project_root, path))
    try:
        return os.path.relpath(absolute, project_root)
    except ValueError:
        # relpath raises ValueError when no relative path exists (e.g. different drives on Windows).
        return absolute


def local_test_command(args: argparse.Namespace) -> int:
    """
    Run a single evaluation test locally, in Docker when a Dockerfile is present.

    The pytest target comes from ``args.entry`` (``path`` or ``path::function``)
    or, when no entry is given, from discovering tests under the current
    working directory and letting the user select exactly one. The file part
    of the target is expressed relative to the project root whenever possible,
    so the same target resolves both on the host and under the container mount.

    Execution mode:
      * ``args.ignore_docker`` set, or no Dockerfile found: pytest runs on the host.
      * exactly one Dockerfile found: the image is built and pytest runs in it.
      * more than one Dockerfile found: an error is reported.

    Returns:
        The pytest / docker exit code, or 1 on any resolution or build error.
    """
    project_root = os.getcwd()

    # --- Resolve the pytest target -------------------------------------
    entry = getattr(args, "entry", None)
    if entry:
        # Only the file part is made relative; a "::node" suffix is re-attached unchanged.
        file_part, separator, node_id = entry.partition("::")
        pytest_target = _relative_to_project(file_part, project_root) + separator + node_id
    else:
        tests = _discover_tests(project_root)
        if not tests:
            print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
            return 1
        non_interactive = bool(getattr(args, "yes", False))
        selected = _prompt_select(tests, non_interactive=non_interactive)
        if not selected:
            print("No tests selected.")
            return 1
        if len(selected) != 1:
            print("Error: Please select exactly one evaluation test for 'local-test'.")
            return 1
        pytest_target = _relative_to_project(selected[0].file_path, project_root)

    if not pytest_target:
        print("Error: Failed to resolve a pytest target to run.")
        return 1

    # --- Choose where to run -------------------------------------------
    if getattr(args, "ignore_docker", False):
        return _run_pytest_host(pytest_target)

    dockerfiles = _find_dockerfiles(project_root)
    if len(dockerfiles) > 1:
        print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
        for df in dockerfiles:
            print(f" - {df}")
        print("Hint: use --ignore-docker to bypass Docker.")
        return 1

    if not dockerfiles:
        # No Dockerfile: run on host
        return _run_pytest_host(pytest_target)

    image_tag = "ep-evaluator:local"
    if not _build_docker_image(dockerfiles[0], image_tag):
        print("Docker build failed. See logs above.")
        return 1
    return _run_pytest_in_docker(project_root, image_tag, pytest_target)
|
|
@@ -552,6 +552,23 @@ def _load_secrets_from_env_file(env_file_path: str) -> Dict[str, str]:
|
|
|
552
552
|
return secrets
|
|
553
553
|
|
|
554
554
|
|
|
555
|
+
def _mask_secret_value(value: str) -> str:
|
|
556
|
+
"""
|
|
557
|
+
Return a masked representation of a secret showing only a small prefix/suffix.
|
|
558
|
+
Example: fw_3Z*******Xgnk
|
|
559
|
+
"""
|
|
560
|
+
try:
|
|
561
|
+
if not isinstance(value, str) or not value:
|
|
562
|
+
return "<empty>"
|
|
563
|
+
prefix_len = 6
|
|
564
|
+
suffix_len = 4
|
|
565
|
+
if len(value) <= prefix_len + suffix_len:
|
|
566
|
+
return value[0] + "***" + value[-1]
|
|
567
|
+
return f"{value[:prefix_len]}***{value[-suffix_len:]}"
|
|
568
|
+
except Exception:
|
|
569
|
+
return "<masked>"
|
|
570
|
+
|
|
571
|
+
|
|
555
572
|
def upload_command(args: argparse.Namespace) -> int:
|
|
556
573
|
root = os.path.abspath(getattr(args, "path", "."))
|
|
557
574
|
entries_arg = getattr(args, "entry", None)
|
|
@@ -602,9 +619,9 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
602
619
|
secrets_from_file = _load_secrets_from_env_file(env_file_path)
|
|
603
620
|
secrets_from_env_file = secrets_from_file.copy() # Track what came from .env file
|
|
604
621
|
|
|
605
|
-
# Also
|
|
622
|
+
# Also consider FIREWORKS_API_KEY from environment, but prefer .env value
|
|
606
623
|
fw_api_key_value = get_fireworks_api_key()
|
|
607
|
-
if fw_api_key_value:
|
|
624
|
+
if fw_api_key_value and "FIREWORKS_API_KEY" not in secrets_from_file:
|
|
608
625
|
secrets_from_file["FIREWORKS_API_KEY"] = fw_api_key_value
|
|
609
626
|
|
|
610
627
|
if not fw_account_id and fw_api_key_value:
|
|
@@ -622,7 +639,11 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
622
639
|
print(f"Loading secrets from: {env_file_path}")
|
|
623
640
|
|
|
624
641
|
for secret_name, secret_value in secrets_from_file.items():
|
|
625
|
-
|
|
642
|
+
source = ".env" if secret_name in secrets_from_env_file else "environment"
|
|
643
|
+
print(
|
|
644
|
+
f"Ensuring {secret_name} is registered as a secret on Fireworks for rollout... "
|
|
645
|
+
f"({source}: {_mask_secret_value(secret_value)})"
|
|
646
|
+
)
|
|
626
647
|
if create_or_update_fireworks_secret(
|
|
627
648
|
account_id=fw_account_id,
|
|
628
649
|
key_name=secret_name,
|
|
@@ -595,7 +595,9 @@ class Evaluator:
|
|
|
595
595
|
logger.error("Missing requirements.txt in upload directory: %s", source_dir)
|
|
596
596
|
raise ValueError(
|
|
597
597
|
"Upload requires requirements.txt in the project root. "
|
|
598
|
-
"
|
|
598
|
+
"Create a requirements.txt (it can be empty) and rerun 'eval-protocol upload' "
|
|
599
|
+
"or 'eval-protocol create rft'. If you're running in a notebook (e.g., Colab), "
|
|
600
|
+
f"create the file in your working directory (e.g., {source_dir}/requirements.txt)."
|
|
599
601
|
)
|
|
600
602
|
|
|
601
603
|
@staticmethod
|
|
@@ -5,6 +5,7 @@ import os
|
|
|
5
5
|
import sys
|
|
6
6
|
import tempfile
|
|
7
7
|
import time
|
|
8
|
+
import uuid
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
from typing import Any, Callable, Dict, Iterable, Optional, Tuple
|
|
10
11
|
|
|
@@ -205,7 +206,8 @@ def build_default_dataset_id(evaluator_id: str) -> str:
|
|
|
205
206
|
|
|
206
207
|
def build_default_output_model(evaluator_id: str) -> str:
    """
    Derive a default output model id from ``evaluator_id``.

    The evaluator id is lower-cased with underscores turned into hyphens,
    then suffixed with "-rft-" and the first four characters of a fresh
    random UUID, so repeated calls yield different ids.
    """
    slug = evaluator_id.lower().replace("_", "-")
    token = uuid.uuid4().hex[:4]
    return "-".join((slug, "rft", token))
|
|
209
211
|
|
|
210
212
|
|
|
211
213
|
__all__ = [
|
{eval_protocol-0.2.84 → eval_protocol-0.2.84.dev2}/eval_protocol/pytest/handle_persist_flow.py
RENAMED
|
@@ -11,6 +11,12 @@ from eval_protocol.common_utils import get_user_agent
|
|
|
11
11
|
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
12
12
|
from eval_protocol.models import EvaluationRow
|
|
13
13
|
from eval_protocol.pytest.store_experiment_link import store_experiment_link
|
|
14
|
+
from eval_protocol.auth import (
|
|
15
|
+
get_fireworks_api_key,
|
|
16
|
+
get_fireworks_account_id,
|
|
17
|
+
verify_api_key_and_get_account_id,
|
|
18
|
+
get_fireworks_api_base,
|
|
19
|
+
)
|
|
14
20
|
|
|
15
21
|
import requests
|
|
16
22
|
|
|
@@ -90,22 +96,16 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
|
|
|
90
96
|
if not should_upload:
|
|
91
97
|
continue
|
|
92
98
|
|
|
93
|
-
|
|
94
|
-
|
|
99
|
+
# Resolve credentials using centralized auth helpers with verification fallback
|
|
100
|
+
fireworks_api_key = get_fireworks_api_key()
|
|
101
|
+
fireworks_account_id = get_fireworks_account_id()
|
|
102
|
+
if not fireworks_account_id and fireworks_api_key:
|
|
95
103
|
try:
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
config.read(config_path)
|
|
100
|
-
for section in ["DEFAULT", "auth"]:
|
|
101
|
-
if config.has_section(section) and config.has_option(section, key):
|
|
102
|
-
return config.get(section, key)
|
|
104
|
+
fireworks_account_id = verify_api_key_and_get_account_id(
|
|
105
|
+
api_key=fireworks_api_key, api_base=get_fireworks_api_base()
|
|
106
|
+
)
|
|
103
107
|
except Exception:
|
|
104
|
-
|
|
105
|
-
return os.getenv(key)
|
|
106
|
-
|
|
107
|
-
fireworks_api_key = get_auth_value("FIREWORKS_API_KEY")
|
|
108
|
-
fireworks_account_id = get_auth_value("FIREWORKS_ACCOUNT_ID")
|
|
108
|
+
fireworks_account_id = None
|
|
109
109
|
|
|
110
110
|
if not fireworks_api_key and not fireworks_account_id:
|
|
111
111
|
store_experiment_link(
|
|
@@ -129,7 +129,7 @@ def handle_persist_flow(all_results: list[list[EvaluationRow]], test_func_name:
|
|
|
129
129
|
)
|
|
130
130
|
continue
|
|
131
131
|
|
|
132
|
-
api_base =
|
|
132
|
+
api_base = get_fireworks_api_base()
|
|
133
133
|
headers = {
|
|
134
134
|
"Authorization": f"Bearer {fireworks_api_key}",
|
|
135
135
|
"Content-Type": "application/json",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.84
|
|
3
|
+
Version: 0.2.84.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -91,6 +91,7 @@ eval_protocol/cli_commands/common.py
|
|
|
91
91
|
eval_protocol/cli_commands/create_rft.py
|
|
92
92
|
eval_protocol/cli_commands/deploy.py
|
|
93
93
|
eval_protocol/cli_commands/deploy_mcp.py
|
|
94
|
+
eval_protocol/cli_commands/local_test.py
|
|
94
95
|
eval_protocol/cli_commands/logs.py
|
|
95
96
|
eval_protocol/cli_commands/preview.py
|
|
96
97
|
eval_protocol/cli_commands/run_eval_cmd.py
|
|
@@ -277,6 +278,7 @@ tests/test_cli.py
|
|
|
277
278
|
tests/test_cli_agent.py
|
|
278
279
|
tests/test_cli_args.py
|
|
279
280
|
tests/test_cli_create_rft_infer.py
|
|
281
|
+
tests/test_cli_local_test.py
|
|
280
282
|
tests/test_code_execution.py
|
|
281
283
|
tests/test_config.py
|
|
282
284
|
tests/test_control_plane_separation.py
|
|
@@ -255,7 +255,8 @@ def test_get_account_id_not_found(mock_path_exists):
|
|
|
255
255
|
with patch("eval_protocol.auth._parse_simple_auth_file", return_value={}) as mock_parse_simple:
|
|
256
256
|
assert get_fireworks_account_id() is None
|
|
257
257
|
mock_parse_simple.assert_not_called()
|
|
258
|
-
|
|
258
|
+
# With verify fallback using get_fireworks_api_key, exists() may be checked more than once
|
|
259
|
+
assert mock_path_exists.call_count >= 1
|
|
259
260
|
|
|
260
261
|
|
|
261
262
|
@patch("pathlib.Path.exists", return_value=True)
|
|
@@ -269,7 +270,8 @@ def test_get_account_id_ini_exists_no_section(mock_parse_simple, mock_ConfigPars
|
|
|
269
270
|
mock_open(read_data="other_key = some_val_but_no_section_header\nanother=val"),
|
|
270
271
|
):
|
|
271
272
|
assert get_fireworks_account_id() is None
|
|
272
|
-
|
|
273
|
+
# Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
|
|
274
|
+
assert mock_parse_simple.call_count >= 1
|
|
273
275
|
|
|
274
276
|
|
|
275
277
|
@patch("pathlib.Path.exists", return_value=True)
|
|
@@ -283,7 +285,8 @@ def test_get_account_id_ini_exists_no_id_option(mock_parse_simple, mock_ConfigPa
|
|
|
283
285
|
|
|
284
286
|
with patch("builtins.open", mock_open(read_data="[fireworks]\nsome_other_key=foo")):
|
|
285
287
|
assert get_fireworks_account_id() is None
|
|
286
|
-
|
|
288
|
+
# Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
|
|
289
|
+
assert mock_parse_simple.call_count >= 1
|
|
287
290
|
|
|
288
291
|
|
|
289
292
|
@patch("pathlib.Path.exists", return_value=True)
|
|
@@ -301,7 +304,8 @@ def test_get_account_id_ini_empty_value(mock_parse_simple, mock_ConfigParser_cla
|
|
|
301
304
|
)
|
|
302
305
|
with patch("builtins.open", mock_open(read_data="[fireworks]\naccount_id=")):
|
|
303
306
|
assert get_fireworks_account_id() is None
|
|
304
|
-
|
|
307
|
+
# Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
|
|
308
|
+
assert mock_parse_simple.call_count >= 1
|
|
305
309
|
|
|
306
310
|
|
|
307
311
|
@patch("pathlib.Path.exists", return_value=True)
|
|
@@ -372,7 +376,8 @@ def test_get_account_id_ini_parse_error(mock_parse_simple, mock_ConfigParser_cla
|
|
|
372
376
|
assert get_fireworks_account_id() is None
|
|
373
377
|
assert "Configparser error reading" in caplog.text
|
|
374
378
|
assert "Mocked Parsing Error" in caplog.text
|
|
375
|
-
|
|
379
|
+
# Fallback verify path may trigger a second simple parse for api_key; ensure at least one call
|
|
380
|
+
assert mock_parse_simple.call_count >= 1
|
|
376
381
|
|
|
377
382
|
|
|
378
383
|
@patch("pathlib.Path.exists", return_value=True)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from types import SimpleNamespace
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
|
|
8
|
+
project = tmp_path / "proj"
|
|
9
|
+
project.mkdir()
|
|
10
|
+
monkeypatch.chdir(project)
|
|
11
|
+
|
|
12
|
+
# Create a dummy test file
|
|
13
|
+
test_file = project / "metric" / "test_one.py"
|
|
14
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
15
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
16
|
+
|
|
17
|
+
# Import module under test
|
|
18
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
19
|
+
|
|
20
|
+
# Avoid Docker path
|
|
21
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
22
|
+
|
|
23
|
+
captured = {"target": ""}
|
|
24
|
+
|
|
25
|
+
def _fake_host(target: str) -> int:
|
|
26
|
+
captured["target"] = target
|
|
27
|
+
return 0
|
|
28
|
+
|
|
29
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
30
|
+
|
|
31
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
32
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
33
|
+
assert rc == 0
|
|
34
|
+
# Expect relative path target
|
|
35
|
+
assert captured["target"] == os.path.relpath(str(test_file), str(project))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
|
|
39
|
+
project = tmp_path / "proj"
|
|
40
|
+
project.mkdir()
|
|
41
|
+
monkeypatch.chdir(project)
|
|
42
|
+
|
|
43
|
+
test_file = project / "metric" / "test_two.py"
|
|
44
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
45
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
46
|
+
|
|
47
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
48
|
+
|
|
49
|
+
# Pretend we have Dockerfile(s), but ignore_docker=True should skip
|
|
50
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
51
|
+
|
|
52
|
+
called = {"host": False}
|
|
53
|
+
|
|
54
|
+
def _fake_host(target: str) -> int:
|
|
55
|
+
called["host"] = True
|
|
56
|
+
return 0
|
|
57
|
+
|
|
58
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
59
|
+
|
|
60
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True)
|
|
61
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
62
|
+
assert rc == 0
|
|
63
|
+
assert called["host"] is True
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
|
|
67
|
+
project = tmp_path / "proj"
|
|
68
|
+
project.mkdir()
|
|
69
|
+
monkeypatch.chdir(project)
|
|
70
|
+
|
|
71
|
+
test_file = project / "metric" / "test_three.py"
|
|
72
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
73
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
74
|
+
|
|
75
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
76
|
+
|
|
77
|
+
monkeypatch.setattr(
|
|
78
|
+
lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
82
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
83
|
+
assert rc == 1
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
|
|
87
|
+
project = tmp_path / "proj"
|
|
88
|
+
project.mkdir()
|
|
89
|
+
monkeypatch.chdir(project)
|
|
90
|
+
|
|
91
|
+
test_file = project / "metric" / "test_four.py"
|
|
92
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
93
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
94
|
+
|
|
95
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
96
|
+
|
|
97
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
|
|
98
|
+
monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
|
|
99
|
+
|
|
100
|
+
captured = {"target": "", "image": ""}
|
|
101
|
+
|
|
102
|
+
def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
|
|
103
|
+
captured["target"] = pytest_target
|
|
104
|
+
captured["image"] = image_tag
|
|
105
|
+
return 0
|
|
106
|
+
|
|
107
|
+
monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
|
|
108
|
+
|
|
109
|
+
args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
|
|
110
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
111
|
+
assert rc == 0
|
|
112
|
+
assert captured["image"] == "ep-evaluator:local"
|
|
113
|
+
assert captured["target"] == os.path.relpath(str(test_file), str(project))
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_local_test_selector_single_test(tmp_path, monkeypatch):
|
|
117
|
+
project = tmp_path / "proj"
|
|
118
|
+
project.mkdir()
|
|
119
|
+
monkeypatch.chdir(project)
|
|
120
|
+
|
|
121
|
+
test_file = project / "metric" / "test_sel.py"
|
|
122
|
+
test_file.parent.mkdir(parents=True, exist_ok=True)
|
|
123
|
+
test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
|
|
124
|
+
|
|
125
|
+
from eval_protocol.cli_commands import local_test as lt
|
|
126
|
+
from eval_protocol.cli_commands import upload as up
|
|
127
|
+
|
|
128
|
+
# No entry; force discover + selector
|
|
129
|
+
disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
|
|
130
|
+
monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
|
|
131
|
+
monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
|
|
132
|
+
monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
|
|
133
|
+
|
|
134
|
+
called = {"host": False}
|
|
135
|
+
|
|
136
|
+
def _fake_host(target: str) -> int:
|
|
137
|
+
called["host"] = True
|
|
138
|
+
return 0
|
|
139
|
+
|
|
140
|
+
monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
|
|
141
|
+
|
|
142
|
+
args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
|
|
143
|
+
rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType]
|
|
144
|
+
assert rc == 0
|
|
145
|
+
assert called["host"] is True
|
|
File without changes
|