eval-protocol 0.2.51.dev0__tar.gz → 0.2.52__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.51.dev0/eval_protocol.egg-info → eval_protocol-0.2.52}/PKG-INFO +1 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/__init__.py +7 -10
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/fireworks_tracing.py +6 -8
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/openai_responses.py +29 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/auth.py +39 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli.py +6 -0
- eval_protocol-0.2.52/eval_protocol/cli_commands/logs.py +76 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/upload.py +27 -54
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/evaluation.py +125 -40
- eval_protocol-0.2.52/eval_protocol/log_utils/fireworks_tracing_http_handler.py +63 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/evaluation_test.py +22 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/remote_rollout_processor.py +22 -3
- eval_protocol-0.2.52/eval_protocol/utils/browser_utils.py +114 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/logs_server.py +9 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/SOURCES.txt +5 -12
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_show_results_url.py +141 -0
- eval_protocol-0.2.52/vite-app/dist/assets/index-BGlGI2LH.css +1 -0
- eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-C81y9r9l.js → eval_protocol-0.2.52/vite-app/dist/assets/index-zf20-zFD.js +25 -25
- eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-C81y9r9l.js.map → eval_protocol-0.2.52/vite-app/dist/assets/index-zf20-zFD.js.map +1 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.51.dev0/eval_protocol/cli_commands/logs.py +0 -36
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/__init__.py +0 -18
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/__init__.py +0 -13
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/app.py +0 -305
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/auth.py +0 -17
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/langfuse.py +0 -528
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/litellm.py +0 -170
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/main.py +0 -10
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/models.py +0 -104
- eval_protocol-0.2.51.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +0 -48
- eval_protocol-0.2.51.dev0/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/LICENSE +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/README.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/pyproject.toml +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/setup.cfg +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/setup.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_format.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_length.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_math.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_models.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/versioneer.py +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.51.dev0
|
|
3
|
+
Version: 0.2.52
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -31,10 +31,14 @@ from .reward_function import RewardFunction
|
|
|
31
31
|
from .typed_interface import reward_function
|
|
32
32
|
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
|
|
33
33
|
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
|
|
34
|
+
from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
|
|
34
35
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
35
36
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
36
37
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
37
38
|
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
39
|
+
from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
|
|
40
|
+
from .log_utils.elasticsearch_client import ElasticsearchConfig
|
|
41
|
+
|
|
38
42
|
|
|
39
43
|
from .types.remote_rollout_processor import (
|
|
40
44
|
InitRequest,
|
|
@@ -70,16 +74,11 @@ try:
|
|
|
70
74
|
except ImportError:
|
|
71
75
|
WeaveAdapter = None
|
|
72
76
|
|
|
73
|
-
try:
|
|
74
|
-
from .proxy import create_app, AuthProvider
|
|
75
|
-
except ImportError:
|
|
76
|
-
create_app = None
|
|
77
|
-
AuthProvider = None
|
|
78
|
-
|
|
79
|
-
|
|
80
77
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
81
78
|
|
|
82
79
|
__all__ = [
|
|
80
|
+
"create_elasticsearch_config_from_env",
|
|
81
|
+
"ElasticsearchConfig",
|
|
83
82
|
"ElasticsearchDirectHttpHandler",
|
|
84
83
|
"RolloutIdFilter",
|
|
85
84
|
"setup_rollout_logging_for_elasticsearch_handler",
|
|
@@ -102,6 +101,7 @@ __all__ = [
|
|
|
102
101
|
"BraintrustAdapter",
|
|
103
102
|
"create_braintrust_adapter",
|
|
104
103
|
"LangSmithAdapter",
|
|
104
|
+
"FireworksTracingHttpHandler",
|
|
105
105
|
# Core interfaces
|
|
106
106
|
"Message",
|
|
107
107
|
"MetricResult",
|
|
@@ -137,9 +137,6 @@ __all__ = [
|
|
|
137
137
|
"RolloutMetadata",
|
|
138
138
|
"StatusResponse",
|
|
139
139
|
"create_langfuse_config_tags",
|
|
140
|
-
# Proxy
|
|
141
|
-
"create_app",
|
|
142
|
-
"AuthProvider",
|
|
143
140
|
]
|
|
144
141
|
|
|
145
142
|
from . import _version
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-13T00:31:45-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "270a91e21f730169bd3ff7f94c44f8c0502ace33",
|
|
15
|
+
"version": "0.2.52"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
|
+
import time
|
|
10
11
|
from datetime import datetime
|
|
11
12
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
-
import os
|
|
13
13
|
|
|
14
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
15
15
|
from .base import BaseAdapter
|
|
@@ -343,17 +343,15 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
343
343
|
# Remove None values
|
|
344
344
|
params = {k: v for k, v in params.items() if v is not None}
|
|
345
345
|
|
|
346
|
-
# Make request to proxy
|
|
346
|
+
# Make request to proxy
|
|
347
347
|
if self.project_id:
|
|
348
|
-
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces
|
|
348
|
+
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
|
|
349
349
|
else:
|
|
350
|
-
url = f"{self.base_url}/v1/traces
|
|
351
|
-
|
|
352
|
-
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
350
|
+
url = f"{self.base_url}/v1/traces"
|
|
353
351
|
|
|
354
352
|
result = None
|
|
355
353
|
try:
|
|
356
|
-
response = requests.get(url, params=params, timeout=self.timeout
|
|
354
|
+
response = requests.get(url, params=params, timeout=self.timeout)
|
|
357
355
|
response.raise_for_status()
|
|
358
356
|
result = response.json()
|
|
359
357
|
except requests.exceptions.HTTPError as e:
|
|
@@ -367,7 +365,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
367
365
|
except Exception: # In case e.response.json() fails
|
|
368
366
|
error_msg = f"Proxy error: {e.response.text}"
|
|
369
367
|
|
|
370
|
-
logger.error("Failed to fetch traces from proxy
|
|
368
|
+
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
371
369
|
return eval_rows
|
|
372
370
|
except requests.exceptions.RequestException as e:
|
|
373
371
|
# Non-HTTP errors (network issues, timeouts, etc.)
|
{eval_protocol-0.2.51.dev0 → eval_protocol-0.2.52}/eval_protocol/adapters/openai_responses.py
RENAMED
|
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
169
169
|
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
|
|
170
170
|
elif item.type == "function_call_output":
|
|
171
171
|
# Collect tool call outputs to add before assistant message
|
|
172
|
-
tool_call_outputs.append(
|
|
172
|
+
tool_call_outputs.append(
|
|
173
|
+
Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
|
|
174
|
+
)
|
|
173
175
|
elif item.type == "function_call":
|
|
174
176
|
tool_call = ChatCompletionMessageToolCall(
|
|
175
177
|
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
|
|
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
186
188
|
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
|
|
187
189
|
|
|
188
190
|
return reversed(messages)
|
|
191
|
+
|
|
192
|
+
def _coerce_tool_output(self, output: Any) -> str:
|
|
193
|
+
"""Coerce OpenAI Responses tool output into a string for Message.content.
|
|
194
|
+
|
|
195
|
+
The Responses API may return structured content lists. For our purposes,
|
|
196
|
+
we stringify non-string outputs to satisfy the Message.content type.
|
|
197
|
+
"""
|
|
198
|
+
if isinstance(output, str):
|
|
199
|
+
return output
|
|
200
|
+
try:
|
|
201
|
+
# Attempt to join list of objects with any 'text' fields
|
|
202
|
+
if isinstance(output, list):
|
|
203
|
+
parts: list[str] = []
|
|
204
|
+
for part in output:
|
|
205
|
+
text = None
|
|
206
|
+
if isinstance(part, dict):
|
|
207
|
+
text = part.get("text")
|
|
208
|
+
if text:
|
|
209
|
+
parts.append(str(text))
|
|
210
|
+
else:
|
|
211
|
+
parts.append(str(part))
|
|
212
|
+
return "\n".join(parts)
|
|
213
|
+
# Fallback to string conversion
|
|
214
|
+
return str(output)
|
|
215
|
+
except Exception:
|
|
216
|
+
return str(output)
|
|
@@ -4,6 +4,8 @@ import os
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Dict, Optional # Added Dict
|
|
6
6
|
|
|
7
|
+
import requests
|
|
8
|
+
|
|
7
9
|
logger = logging.getLogger(__name__)
|
|
8
10
|
|
|
9
11
|
# Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
|
|
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
|
|
|
218
220
|
else:
|
|
219
221
|
logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
|
|
220
222
|
return api_base
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def verify_api_key_and_get_account_id(
|
|
226
|
+
api_key: Optional[str] = None,
|
|
227
|
+
api_base: Optional[str] = None,
|
|
228
|
+
) -> Optional[str]:
|
|
229
|
+
"""
|
|
230
|
+
Calls the Fireworks API verify endpoint to validate the API key and returns the
|
|
231
|
+
account id from response headers when available.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
|
|
235
|
+
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
The resolved account id if verification succeeds and the header is present; otherwise None.
|
|
239
|
+
"""
|
|
240
|
+
try:
|
|
241
|
+
resolved_key = api_key or get_fireworks_api_key()
|
|
242
|
+
if not resolved_key:
|
|
243
|
+
return None
|
|
244
|
+
resolved_base = api_base or get_fireworks_api_base()
|
|
245
|
+
url = f"{resolved_base.rstrip('/')}/verifyApiKey"
|
|
246
|
+
headers = {"Authorization": f"Bearer {resolved_key}"}
|
|
247
|
+
resp = requests.get(url, headers=headers, timeout=10)
|
|
248
|
+
if resp.status_code != 200:
|
|
249
|
+
logger.debug("verifyApiKey returned status %s", resp.status_code)
|
|
250
|
+
return None
|
|
251
|
+
# Header keys could vary in case; requests provides case-insensitive dict
|
|
252
|
+
account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
|
|
253
|
+
if account_id and account_id.strip():
|
|
254
|
+
logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
|
|
255
|
+
return account_id.strip()
|
|
256
|
+
return None
|
|
257
|
+
except Exception as e:
|
|
258
|
+
logger.debug("Failed to verify API key for account id resolution: %s", e)
|
|
259
|
+
return None
|
|
@@ -301,6 +301,12 @@ def parse_args(args=None):
|
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
303
|
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
304
|
+
logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
|
|
305
|
+
logs_parser.add_argument(
|
|
306
|
+
"--use-env-elasticsearch-config",
|
|
307
|
+
action="store_true",
|
|
308
|
+
help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
|
|
309
|
+
)
|
|
304
310
|
|
|
305
311
|
# Upload command
|
|
306
312
|
upload_parser = subparsers.add_parser(
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for serving logs with file watching and real-time updates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..utils.logs_server import serve_logs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def logs_command(args):
|
|
12
|
+
"""Serve logs with file watching and real-time updates"""
|
|
13
|
+
|
|
14
|
+
port = args.port
|
|
15
|
+
print("🚀 Starting Eval Protocol Logs Server")
|
|
16
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
17
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
18
|
+
print(f"👀 Watching paths: {['current directory']}")
|
|
19
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
20
|
+
print("Press Ctrl+C to stop the server")
|
|
21
|
+
print("-" * 50)
|
|
22
|
+
|
|
23
|
+
# Setup Elasticsearch based on flags
|
|
24
|
+
elasticsearch_config = None
|
|
25
|
+
try:
|
|
26
|
+
if getattr(args, "use_env_elasticsearch_config", False):
|
|
27
|
+
# Use environment variables for configuration
|
|
28
|
+
print("⚙️ Using environment variables for Elasticsearch config")
|
|
29
|
+
from eval_protocol.pytest.remote_rollout_processor import (
|
|
30
|
+
create_elasticsearch_config_from_env,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
elasticsearch_config = create_elasticsearch_config_from_env()
|
|
34
|
+
# Ensure index exists with correct mapping, mirroring Docker setup path
|
|
35
|
+
try:
|
|
36
|
+
from eval_protocol.log_utils.elasticsearch_index_manager import (
|
|
37
|
+
ElasticsearchIndexManager,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
index_manager = ElasticsearchIndexManager(
|
|
41
|
+
elasticsearch_config.url,
|
|
42
|
+
elasticsearch_config.index_name,
|
|
43
|
+
elasticsearch_config.api_key,
|
|
44
|
+
)
|
|
45
|
+
created = index_manager.create_logging_index_mapping()
|
|
46
|
+
if created:
|
|
47
|
+
print(
|
|
48
|
+
f"🧭 Verified Elasticsearch index '{elasticsearch_config.index_name}' mapping (created or already correct)"
|
|
49
|
+
)
|
|
50
|
+
else:
|
|
51
|
+
print(
|
|
52
|
+
f"⚠️ Could not verify/create mapping for index '{elasticsearch_config.index_name}'. Searches may behave unexpectedly."
|
|
53
|
+
)
|
|
54
|
+
except Exception as e:
|
|
55
|
+
print(f"⚠️ Failed to ensure index mapping via IndexManager: {e}")
|
|
56
|
+
elif not getattr(args, "disable_elasticsearch_setup", False):
|
|
57
|
+
# Default behavior: start or connect to local Elasticsearch via Docker helper
|
|
58
|
+
from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
|
|
59
|
+
|
|
60
|
+
print("🧰 Auto-configuring local Elasticsearch (Docker)")
|
|
61
|
+
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
62
|
+
else:
|
|
63
|
+
print("🚫 Elasticsearch setup disabled; running without Elasticsearch integration")
|
|
64
|
+
except Exception as e:
|
|
65
|
+
print(f"❌ Failed to configure Elasticsearch: {e}")
|
|
66
|
+
return 1
|
|
67
|
+
|
|
68
|
+
try:
|
|
69
|
+
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
|
|
70
|
+
return 0
|
|
71
|
+
except KeyboardInterrupt:
|
|
72
|
+
print("\n🛑 Server stopped by user")
|
|
73
|
+
return 0
|
|
74
|
+
except Exception as e:
|
|
75
|
+
print(f"❌ Error starting server: {e}")
|
|
76
|
+
return 1
|
|
@@ -12,7 +12,12 @@ from pathlib import Path
|
|
|
12
12
|
from typing import Any, Callable, Iterable, Optional
|
|
13
13
|
|
|
14
14
|
import pytest
|
|
15
|
-
from eval_protocol.auth import
|
|
15
|
+
from eval_protocol.auth import (
|
|
16
|
+
get_fireworks_account_id,
|
|
17
|
+
get_fireworks_api_key,
|
|
18
|
+
get_fireworks_api_base,
|
|
19
|
+
verify_api_key_and_get_account_id,
|
|
20
|
+
)
|
|
16
21
|
from eval_protocol.platform_api import create_or_update_fireworks_secret
|
|
17
22
|
|
|
18
23
|
from eval_protocol.evaluation import create_evaluation
|
|
@@ -259,7 +264,7 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
259
264
|
raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
|
|
260
265
|
|
|
261
266
|
|
|
262
|
-
def
|
|
267
|
+
def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
263
268
|
target, func = _parse_entry(entry, cwd)
|
|
264
269
|
|
|
265
270
|
# Check if target looks like a file path
|
|
@@ -293,47 +298,12 @@ def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, s
|
|
|
293
298
|
raise ValueError(f"Function '{func}' not found in module '{module_name}'")
|
|
294
299
|
|
|
295
300
|
qualname = f"{module_name}.{func}"
|
|
296
|
-
|
|
297
|
-
DiscoveredTest(
|
|
298
|
-
module_path=module_name,
|
|
299
|
-
module_name=module_name,
|
|
300
|
-
qualname=qualname,
|
|
301
|
-
file_path=getattr(module, "__file__", module_name),
|
|
302
|
-
lineno=None,
|
|
303
|
-
has_parametrize=False,
|
|
304
|
-
param_count=0,
|
|
305
|
-
nodeids=[],
|
|
306
|
-
)
|
|
307
|
-
)
|
|
308
|
-
return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
|
|
301
|
+
return qualname, os.path.abspath(source_file_path) if source_file_path else ""
|
|
309
302
|
|
|
310
303
|
|
|
311
304
|
def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
|
|
312
|
-
#
|
|
313
|
-
|
|
314
|
-
func = test.qualname.split(".")[-1]
|
|
315
|
-
code = f"""
|
|
316
|
-
from typing import Any, Dict, List, Optional, Union
|
|
317
|
-
|
|
318
|
-
from eval_protocol.models import EvaluationRow, Message
|
|
319
|
-
from {module} import {func} as _ep_test
|
|
320
|
-
|
|
321
|
-
def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
|
|
322
|
-
row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
|
|
323
|
-
result = _ep_test(row) # Supports sync/async via decorator's dual-mode
|
|
324
|
-
if hasattr(result, "__await__"):
|
|
325
|
-
import asyncio
|
|
326
|
-
result = asyncio.get_event_loop().run_until_complete(result)
|
|
327
|
-
if result.evaluation_result is None:
|
|
328
|
-
return {{"score": 0.0, "reason": "No evaluation_result set"}}
|
|
329
|
-
out = {{
|
|
330
|
-
"score": float(result.evaluation_result.score or 0.0),
|
|
331
|
-
"reason": result.evaluation_result.reason,
|
|
332
|
-
"metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
|
|
333
|
-
}}
|
|
334
|
-
return out
|
|
335
|
-
"""
|
|
336
|
-
return (code, "main.py")
|
|
305
|
+
# Deprecated: we no longer generate a shim; keep stub for import compatibility
|
|
306
|
+
return ("", "main.py")
|
|
337
307
|
|
|
338
308
|
|
|
339
309
|
def _normalize_evaluator_id(evaluator_id: str) -> str:
|
|
@@ -522,10 +492,10 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
522
492
|
entries_arg = getattr(args, "entry", None)
|
|
523
493
|
if entries_arg:
|
|
524
494
|
entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
|
|
525
|
-
selected_specs: list[tuple[str, str
|
|
495
|
+
selected_specs: list[tuple[str, str]] = []
|
|
526
496
|
for e in entries:
|
|
527
|
-
|
|
528
|
-
selected_specs.append((
|
|
497
|
+
qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
|
|
498
|
+
selected_specs.append((qualname, resolved_path))
|
|
529
499
|
else:
|
|
530
500
|
print("Scanning for evaluation tests...")
|
|
531
501
|
tests = _discover_tests(root)
|
|
@@ -545,11 +515,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
545
515
|
print(" handles all parameter combinations. The evaluator will work with")
|
|
546
516
|
print(" the same logic regardless of which model/parameters are used.")
|
|
547
517
|
|
|
548
|
-
selected_specs = []
|
|
549
|
-
for t in selected_tests:
|
|
550
|
-
code, file_name = _generate_ts_mode_code(t)
|
|
551
|
-
# Store test info for better ID generation
|
|
552
|
-
selected_specs.append((code, file_name, t.qualname, t.file_path))
|
|
518
|
+
selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
|
|
553
519
|
|
|
554
520
|
base_id = getattr(args, "id", None)
|
|
555
521
|
display_name = getattr(args, "display_name", None)
|
|
@@ -560,6 +526,14 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
560
526
|
try:
|
|
561
527
|
fw_account_id = get_fireworks_account_id()
|
|
562
528
|
fw_api_key_value = get_fireworks_api_key()
|
|
529
|
+
if not fw_account_id and fw_api_key_value:
|
|
530
|
+
# Attempt to verify and resolve account id from server headers
|
|
531
|
+
resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
|
|
532
|
+
if resolved:
|
|
533
|
+
fw_account_id = resolved
|
|
534
|
+
# Propagate to environment so downstream calls use it if needed
|
|
535
|
+
os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
|
|
536
|
+
print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
|
|
563
537
|
if fw_account_id and fw_api_key_value:
|
|
564
538
|
print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
|
|
565
539
|
if create_or_update_fireworks_secret(
|
|
@@ -579,8 +553,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
579
553
|
print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
|
|
580
554
|
|
|
581
555
|
exit_code = 0
|
|
582
|
-
for i, (
|
|
583
|
-
# Use ts_mode to upload evaluator
|
|
556
|
+
for i, (qualname, source_file_path) in enumerate(selected_specs):
|
|
584
557
|
# Generate a short default ID from just the test function name
|
|
585
558
|
if base_id:
|
|
586
559
|
evaluator_id = base_id
|
|
@@ -618,12 +591,12 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
618
591
|
|
|
619
592
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
620
593
|
try:
|
|
594
|
+
# Always treat as a single evaluator (single-metric) even if folder has helper modules
|
|
595
|
+
test_dir = os.path.dirname(source_file_path) if source_file_path else root
|
|
596
|
+
metric_name = os.path.basename(test_dir) or "metric"
|
|
621
597
|
result = create_evaluation(
|
|
622
598
|
evaluator_id=evaluator_id,
|
|
623
|
-
|
|
624
|
-
python_file_name_for_code=file_name,
|
|
625
|
-
criterion_name_for_code=qualname,
|
|
626
|
-
criterion_description_for_code=description or f"Evaluator for {qualname}",
|
|
599
|
+
metric_folders=[f"{metric_name}={test_dir}"],
|
|
627
600
|
display_name=display_name or evaluator_id,
|
|
628
601
|
description=description or f"Evaluator for {qualname}",
|
|
629
602
|
force=force,
|