eval-protocol 0.2.45.dev0__tar.gz → 0.2.46__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.45.dev0/eval_protocol.egg-info → eval_protocol-0.2.46}/PKG-INFO +1 -1
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/fireworks_tracing.py +2 -4
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/evaluation_test.py +22 -1
- eval_protocol-0.2.46/eval_protocol/utils/browser_utils.py +114 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/logs_server.py +9 -1
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol.egg-info/SOURCES.txt +1 -8
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_show_results_url.py +141 -0
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/__init__.py +0 -10
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/app.py +0 -259
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/auth.py +0 -12
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/langfuse.py +0 -358
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/litellm.py +0 -168
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/main.py +0 -10
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/models.py +0 -51
- eval_protocol-0.2.45.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +0 -48
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/LICENSE +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/README.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/development/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/pyproject.toml +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/setup.cfg +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/setup.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_format.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_length.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_math.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_models.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/versioneer.py +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/assets/index-C81y9r9l.js +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/assets/index-C81y9r9l.js.map +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/assets/index-DpYZaoAr.css +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.45.dev0
|
|
3
|
+
Version: 0.2.46
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-09T13:58:14-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.45.dev0"
|
|
14
|
+
"full-revisionid": "e066febd15f9056f74c40c8f4c34d3c68768fd59",
|
|
15
|
+
"version": "0.2.46"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.45.dev0 → eval_protocol-0.2.46}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
|
+
import time
|
|
10
11
|
from datetime import datetime
|
|
11
12
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
-
import os
|
|
13
13
|
|
|
14
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
15
15
|
from .base import BaseAdapter
|
|
@@ -349,11 +349,9 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
349
349
|
else:
|
|
350
350
|
url = f"{self.base_url}/v1/traces"
|
|
351
351
|
|
|
352
|
-
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
353
|
-
|
|
354
352
|
result = None
|
|
355
353
|
try:
|
|
356
|
-
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
354
|
+
response = requests.get(url, params=params, timeout=self.timeout)
|
|
357
355
|
response.raise_for_status()
|
|
358
356
|
result = response.json()
|
|
359
357
|
except requests.exceptions.HTTPError as e:
|
|
@@ -62,7 +62,8 @@ from eval_protocol.pytest.utils import (
|
|
|
62
62
|
run_tasks_with_eval_progress,
|
|
63
63
|
run_tasks_with_run_progress,
|
|
64
64
|
)
|
|
65
|
-
from eval_protocol.utils.show_results_url import store_local_ui_results_url
|
|
65
|
+
from eval_protocol.utils.show_results_url import store_local_ui_results_url, generate_invocation_filter_url
|
|
66
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running, open_browser_tab
|
|
66
67
|
|
|
67
68
|
from ..common_utils import load_jsonl
|
|
68
69
|
|
|
@@ -80,6 +81,7 @@ def evaluation_test(
|
|
|
80
81
|
rollout_processor_kwargs: RolloutProcessorInputParam | None = None,
|
|
81
82
|
aggregation_method: AggregationMethod = "mean",
|
|
82
83
|
passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
|
|
84
|
+
disable_browser_open: bool = False,
|
|
83
85
|
num_runs: int = 1,
|
|
84
86
|
filtered_row_ids: Sequence[str] | None = None,
|
|
85
87
|
max_dataset_rows: int | None = None,
|
|
@@ -246,10 +248,29 @@ def evaluation_test(
|
|
|
246
248
|
else:
|
|
247
249
|
invocation_id = generate_id()
|
|
248
250
|
|
|
251
|
+
# Track whether we've opened browser for this invocation
|
|
252
|
+
browser_opened_for_invocation = False
|
|
253
|
+
|
|
249
254
|
async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
|
|
255
|
+
nonlocal browser_opened_for_invocation
|
|
256
|
+
|
|
250
257
|
# Store URL for viewing results (after all postprocessing is complete)
|
|
251
258
|
store_local_ui_results_url(invocation_id)
|
|
252
259
|
|
|
260
|
+
# Auto-open browser if server is running and not disabled (only once per invocation)
|
|
261
|
+
if (
|
|
262
|
+
not browser_opened_for_invocation
|
|
263
|
+
and not disable_browser_open
|
|
264
|
+
and os.environ.get("EP_DISABLE_AUTO_BROWSER") is None
|
|
265
|
+
):
|
|
266
|
+
is_running, port = is_logs_server_running()
|
|
267
|
+
if is_running:
|
|
268
|
+
# Generate URL for table view with invocation filter
|
|
269
|
+
base_url = f"http://localhost:{port}" if port else "http://localhost:8000"
|
|
270
|
+
table_url = generate_invocation_filter_url(invocation_id, f"{base_url}/table")
|
|
271
|
+
open_browser_tab(table_url)
|
|
272
|
+
browser_opened_for_invocation = True
|
|
273
|
+
|
|
253
274
|
eval_metadata = None
|
|
254
275
|
|
|
255
276
|
all_results: list[list[EvaluationRow]] = [[] for _ in range(num_runs)]
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser utilities for auto-opening evaluation results in the local UI.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
import webbrowser
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Tuple, Optional
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import psutil
|
|
15
|
+
|
|
16
|
+
PSUTIL_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
PSUTIL_AVAILABLE = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_pid_file_path() -> Path:
|
|
22
|
+
"""Get the path to the logs server PID file."""
|
|
23
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
24
|
+
|
|
25
|
+
return Path(find_eval_protocol_dir()) / "logs_server.pid"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def write_pid_file(pid: int, port: int) -> None:
|
|
29
|
+
"""
|
|
30
|
+
Write the server PID and port to a file for external processes to check.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
pid: The process ID of the logs server
|
|
34
|
+
port: The port the server is running on
|
|
35
|
+
"""
|
|
36
|
+
try:
|
|
37
|
+
pid_file = _get_pid_file_path()
|
|
38
|
+
|
|
39
|
+
data = {"pid": pid, "port": port}
|
|
40
|
+
|
|
41
|
+
with open(pid_file, "w") as f:
|
|
42
|
+
json.dump(data, f)
|
|
43
|
+
|
|
44
|
+
# Use print instead of logger to avoid circular imports
|
|
45
|
+
print(f"Wrote PID file: {pid_file} with PID {pid} and port {port}")
|
|
46
|
+
except Exception as e:
|
|
47
|
+
print(f"Warning: Failed to write PID file: {e}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_logs_server_running() -> Tuple[bool, Optional[int]]:
|
|
51
|
+
"""
|
|
52
|
+
Check if the logs server is running by reading the PID file and verifying the process.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
Tuple of (is_running, port) where:
|
|
56
|
+
- is_running: True if server is running, False otherwise
|
|
57
|
+
- port: The port the server is running on, or None if not running
|
|
58
|
+
"""
|
|
59
|
+
if not PSUTIL_AVAILABLE:
|
|
60
|
+
return False, None
|
|
61
|
+
|
|
62
|
+
pid_file = _get_pid_file_path()
|
|
63
|
+
if not pid_file.exists():
|
|
64
|
+
return False, None
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
with open(pid_file, "r") as f:
|
|
68
|
+
data = json.load(f)
|
|
69
|
+
pid = data.get("pid")
|
|
70
|
+
port = data.get("port")
|
|
71
|
+
except (json.JSONDecodeError, KeyError, FileNotFoundError):
|
|
72
|
+
return False, None
|
|
73
|
+
|
|
74
|
+
if pid is None:
|
|
75
|
+
return False, None
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
# Check if the process is still running
|
|
79
|
+
process = psutil.Process(pid)
|
|
80
|
+
if not process.is_running():
|
|
81
|
+
return False, None
|
|
82
|
+
|
|
83
|
+
# Optionally verify it's listening on the expected port
|
|
84
|
+
if port is not None:
|
|
85
|
+
try:
|
|
86
|
+
connections = process.net_connections()
|
|
87
|
+
for conn in connections:
|
|
88
|
+
if conn.laddr.port == port and conn.status == "LISTEN":
|
|
89
|
+
return True, port
|
|
90
|
+
except (psutil.AccessDenied, psutil.NoSuchProcess):
|
|
91
|
+
# If we can't check connections, assume it's running if process exists
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
return True, port
|
|
95
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
96
|
+
return False, None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def open_browser_tab(url: str, delay: float = 0.5) -> None:
|
|
100
|
+
"""
|
|
101
|
+
Open a URL in a new browser tab with an optional delay.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
url: The URL to open
|
|
105
|
+
delay: Delay in seconds before opening browser (default: 0.5)
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
def _open():
|
|
109
|
+
time.sleep(delay) # Give the server time to start
|
|
110
|
+
webbrowser.open_new_tab(url)
|
|
111
|
+
|
|
112
|
+
thread = threading.Thread(target=_open)
|
|
113
|
+
thread.daemon = True
|
|
114
|
+
thread.start()
|
|
@@ -6,6 +6,7 @@ import threading
|
|
|
6
6
|
import time
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
from contextlib import asynccontextmanager
|
|
9
|
+
from pathlib import Path
|
|
9
10
|
from queue import Queue
|
|
10
11
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
11
12
|
|
|
@@ -23,6 +24,7 @@ from eval_protocol.utils.vite_server import ViteServer
|
|
|
23
24
|
from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
|
|
24
25
|
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
25
26
|
from eval_protocol.utils.logs_models import LogEntry, LogsResponse
|
|
27
|
+
from eval_protocol.utils.browser_utils import write_pid_file
|
|
26
28
|
|
|
27
29
|
if TYPE_CHECKING:
|
|
28
30
|
from eval_protocol.models import EvaluationRow
|
|
@@ -378,7 +380,7 @@ class LogsServer(ViteServer):
|
|
|
378
380
|
event_bus.subscribe(self._handle_event)
|
|
379
381
|
logger.debug("[LOGS_SERVER_INIT] Successfully subscribed to event bus")
|
|
380
382
|
|
|
381
|
-
logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {host}:{port}")
|
|
383
|
+
logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {self.host}:{self.port}")
|
|
382
384
|
|
|
383
385
|
def _setup_websocket_routes(self):
|
|
384
386
|
"""Set up WebSocket routes for real-time communication."""
|
|
@@ -541,6 +543,12 @@ class LogsServer(ViteServer):
|
|
|
541
543
|
)
|
|
542
544
|
|
|
543
545
|
server = uvicorn.Server(config)
|
|
546
|
+
|
|
547
|
+
# Write PID file after server is configured but before serving
|
|
548
|
+
logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Writing PID file for port {self.port}")
|
|
549
|
+
write_pid_file(os.getpid(), self.port)
|
|
550
|
+
logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Successfully wrote PID file for port {self.port}")
|
|
551
|
+
|
|
544
552
|
await server.serve()
|
|
545
553
|
|
|
546
554
|
except KeyboardInterrupt:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.45.dev0
|
|
3
|
+
Version: 0.2.46
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -165,14 +165,6 @@ eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
|
|
|
165
165
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md
|
|
166
166
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md
|
|
167
167
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md
|
|
168
|
-
eval_protocol/proxy/proxy_core/__init__.py
|
|
169
|
-
eval_protocol/proxy/proxy_core/app.py
|
|
170
|
-
eval_protocol/proxy/proxy_core/auth.py
|
|
171
|
-
eval_protocol/proxy/proxy_core/langfuse.py
|
|
172
|
-
eval_protocol/proxy/proxy_core/litellm.py
|
|
173
|
-
eval_protocol/proxy/proxy_core/main.py
|
|
174
|
-
eval_protocol/proxy/proxy_core/models.py
|
|
175
|
-
eval_protocol/proxy/proxy_core/redis_utils.py
|
|
176
168
|
eval_protocol/pytest/__init__.py
|
|
177
169
|
eval_protocol/pytest/default_agent_rollout_processor.py
|
|
178
170
|
eval_protocol/pytest/default_dataset_adapter.py
|
|
@@ -237,6 +229,7 @@ eval_protocol/types/types.py
|
|
|
237
229
|
eval_protocol/utils/__init__.py
|
|
238
230
|
eval_protocol/utils/batch_evaluation.py
|
|
239
231
|
eval_protocol/utils/batch_transformation.py
|
|
232
|
+
eval_protocol/utils/browser_utils.py
|
|
240
233
|
eval_protocol/utils/check_server_status.py
|
|
241
234
|
eval_protocol/utils/dataset_helpers.py
|
|
242
235
|
eval_protocol/utils/logs_models.py
|
|
@@ -6,6 +6,13 @@ import socket
|
|
|
6
6
|
from unittest.mock import patch, MagicMock
|
|
7
7
|
import pytest
|
|
8
8
|
|
|
9
|
+
try:
|
|
10
|
+
import psutil
|
|
11
|
+
|
|
12
|
+
PSUTIL_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
PSUTIL_AVAILABLE = False
|
|
15
|
+
|
|
9
16
|
from eval_protocol.utils.show_results_url import (
|
|
10
17
|
is_server_running,
|
|
11
18
|
generate_invocation_filter_url,
|
|
@@ -193,3 +200,137 @@ class TestIntegration:
|
|
|
193
200
|
assert "table" in call_args[2]
|
|
194
201
|
assert "integration-test" in call_args[1]
|
|
195
202
|
assert "integration-test" in call_args[2]
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
class TestBrowserUtilities:
|
|
206
|
+
"""Test browser utility functions."""
|
|
207
|
+
|
|
208
|
+
def test_get_pid_file_path(self):
|
|
209
|
+
"""Test PID file path generation."""
|
|
210
|
+
from eval_protocol.utils.browser_utils import _get_pid_file_path
|
|
211
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
212
|
+
from pathlib import Path
|
|
213
|
+
|
|
214
|
+
pid_file = _get_pid_file_path()
|
|
215
|
+
expected = Path(find_eval_protocol_dir()) / "logs_server.pid"
|
|
216
|
+
assert pid_file == expected
|
|
217
|
+
|
|
218
|
+
def test_is_logs_server_running_no_pid_file(self, tmp_path, monkeypatch):
|
|
219
|
+
"""Test server detection when PID file doesn't exist."""
|
|
220
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running
|
|
221
|
+
|
|
222
|
+
# Mock the PID file path to a non-existent file
|
|
223
|
+
monkeypatch.setattr(
|
|
224
|
+
"eval_protocol.utils.browser_utils._get_pid_file_path", lambda: tmp_path / "nonexistent.pid"
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
is_running, port = is_logs_server_running()
|
|
228
|
+
assert not is_running
|
|
229
|
+
assert port is None
|
|
230
|
+
|
|
231
|
+
def test_is_logs_server_running_invalid_pid_file(self, tmp_path, monkeypatch):
|
|
232
|
+
"""Test server detection with invalid PID file content."""
|
|
233
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running
|
|
234
|
+
|
|
235
|
+
# Create invalid PID file
|
|
236
|
+
pid_file = tmp_path / "invalid.pid"
|
|
237
|
+
pid_file.write_text("invalid json")
|
|
238
|
+
monkeypatch.setattr("eval_protocol.utils.browser_utils._get_pid_file_path", lambda: pid_file)
|
|
239
|
+
|
|
240
|
+
is_running, port = is_logs_server_running()
|
|
241
|
+
assert not is_running
|
|
242
|
+
assert port is None
|
|
243
|
+
|
|
244
|
+
def test_is_logs_server_running_missing_pid_key(self, tmp_path, monkeypatch):
|
|
245
|
+
"""Test server detection with PID file missing required keys."""
|
|
246
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running
|
|
247
|
+
import json
|
|
248
|
+
|
|
249
|
+
# Create PID file with missing pid key
|
|
250
|
+
pid_file = tmp_path / "missing_pid.pid"
|
|
251
|
+
pid_file.write_text(json.dumps({"port": 8000}))
|
|
252
|
+
monkeypatch.setattr("eval_protocol.utils.browser_utils._get_pid_file_path", lambda: pid_file)
|
|
253
|
+
|
|
254
|
+
is_running, port = is_logs_server_running()
|
|
255
|
+
assert not is_running
|
|
256
|
+
assert port is None
|
|
257
|
+
|
|
258
|
+
@pytest.mark.skipif(not PSUTIL_AVAILABLE, reason="psutil not available")
|
|
259
|
+
def test_is_logs_server_running_nonexistent_process(self, tmp_path, monkeypatch):
|
|
260
|
+
"""Test server detection with PID file pointing to non-existent process."""
|
|
261
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running
|
|
262
|
+
import json
|
|
263
|
+
|
|
264
|
+
# Create PID file with non-existent PID
|
|
265
|
+
pid_file = tmp_path / "nonexistent_process.pid"
|
|
266
|
+
pid_file.write_text(json.dumps({"pid": 999999, "port": 8000}))
|
|
267
|
+
monkeypatch.setattr("eval_protocol.utils.browser_utils._get_pid_file_path", lambda: pid_file)
|
|
268
|
+
|
|
269
|
+
is_running, port = is_logs_server_running()
|
|
270
|
+
assert not is_running
|
|
271
|
+
assert port is None
|
|
272
|
+
|
|
273
|
+
@pytest.mark.skipif(not PSUTIL_AVAILABLE, reason="psutil not available")
|
|
274
|
+
def test_is_logs_server_running_current_process(self, tmp_path, monkeypatch):
|
|
275
|
+
"""Test server detection with PID file pointing to current process."""
|
|
276
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running
|
|
277
|
+
import json
|
|
278
|
+
import os
|
|
279
|
+
|
|
280
|
+
# Create PID file with current process PID
|
|
281
|
+
pid_file = tmp_path / "current_process.pid"
|
|
282
|
+
pid_file.write_text(json.dumps({"pid": os.getpid(), "port": 8000}))
|
|
283
|
+
monkeypatch.setattr("eval_protocol.utils.browser_utils._get_pid_file_path", lambda: pid_file)
|
|
284
|
+
|
|
285
|
+
is_running, port = is_logs_server_running()
|
|
286
|
+
assert is_running
|
|
287
|
+
assert port == 8000
|
|
288
|
+
|
|
289
|
+
def test_open_browser_tab(self, monkeypatch):
|
|
290
|
+
"""Test browser tab opening."""
|
|
291
|
+
from eval_protocol.utils.browser_utils import open_browser_tab
|
|
292
|
+
|
|
293
|
+
opened_urls = []
|
|
294
|
+
|
|
295
|
+
def mock_open_new_tab(url):
|
|
296
|
+
opened_urls.append(url)
|
|
297
|
+
|
|
298
|
+
monkeypatch.setattr("webbrowser.open_new_tab", mock_open_new_tab)
|
|
299
|
+
|
|
300
|
+
# Test with delay
|
|
301
|
+
open_browser_tab("http://example.com", delay=0.01)
|
|
302
|
+
|
|
303
|
+
# Wait a bit for the thread to execute
|
|
304
|
+
import time
|
|
305
|
+
|
|
306
|
+
time.sleep(0.02)
|
|
307
|
+
|
|
308
|
+
assert len(opened_urls) == 1
|
|
309
|
+
assert opened_urls[0] == "http://example.com"
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
class TestLogsServerPidFile:
|
|
313
|
+
"""Test logs server PID file functionality."""
|
|
314
|
+
|
|
315
|
+
def test_write_pid_file(self, tmp_path, monkeypatch):
|
|
316
|
+
"""Test PID file writing."""
|
|
317
|
+
from eval_protocol.utils.browser_utils import write_pid_file
|
|
318
|
+
import json
|
|
319
|
+
|
|
320
|
+
# Mock the find_eval_protocol_dir function
|
|
321
|
+
monkeypatch.setattr("eval_protocol.directory_utils.find_eval_protocol_dir", lambda: str(tmp_path))
|
|
322
|
+
|
|
323
|
+
# Test writing PID file
|
|
324
|
+
write_pid_file(12345, 8000)
|
|
325
|
+
|
|
326
|
+
# Check that PID file was created
|
|
327
|
+
pid_file = tmp_path / "logs_server.pid"
|
|
328
|
+
assert pid_file.exists()
|
|
329
|
+
|
|
330
|
+
# Check content
|
|
331
|
+
with open(pid_file, "r") as f:
|
|
332
|
+
data = json.load(f)
|
|
333
|
+
assert "pid" in data
|
|
334
|
+
assert "port" in data
|
|
335
|
+
assert data["port"] == 8000
|
|
336
|
+
assert data["pid"] == 12345
|