eval-protocol 0.2.46.dev3__tar.gz → 0.2.48__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.46.dev3/eval_protocol.egg-info → eval_protocol-0.2.48}/PKG-INFO +1 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/__init__.py +5 -10
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/fireworks_tracing.py +6 -8
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/openai_responses.py +29 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli.py +6 -0
- eval_protocol-0.2.48/eval_protocol/cli_commands/logs.py +54 -0
- eval_protocol-0.2.48/eval_protocol/log_utils/fireworks_tracing_http_handler.py +63 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/evaluation_test.py +22 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/remote_rollout_processor.py +22 -3
- eval_protocol-0.2.48/eval_protocol/utils/browser_utils.py +114 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/logs_server.py +9 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/SOURCES.txt +5 -12
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_show_results_url.py +141 -0
- eval_protocol-0.2.48/vite-app/dist/assets/index-34WaHH5W.css +1 -0
- eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-C81y9r9l.js → eval_protocol-0.2.48/vite-app/dist/assets/index-DOPsfOMT.js +4 -4
- eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-C81y9r9l.js.map → eval_protocol-0.2.48/vite-app/dist/assets/index-DOPsfOMT.js.map +1 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.46.dev3/eval_protocol/cli_commands/logs.py +0 -36
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/__init__.py +0 -17
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/__init__.py +0 -12
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/app.py +0 -305
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/auth.py +0 -18
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/langfuse.py +0 -526
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/litellm.py +0 -171
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/main.py +0 -10
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/models.py +0 -92
- eval_protocol-0.2.46.dev3/eval_protocol/proxy/proxy_core/redis_utils.py +0 -48
- eval_protocol-0.2.46.dev3/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/LICENSE +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/README.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/pyproject.toml +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/setup.cfg +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/setup.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_format.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_length.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_math.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_models.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/versioneer.py +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.46.dev3
|
|
3
|
+
Version: 0.2.48
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -35,6 +35,9 @@ from .pytest.parameterize import DefaultParameterIdGenerator
|
|
|
35
35
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
36
36
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
37
37
|
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
38
|
+
from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
|
|
39
|
+
from .log_utils.elasticsearch_client import ElasticsearchConfig
|
|
40
|
+
|
|
38
41
|
|
|
39
42
|
from .types.remote_rollout_processor import (
|
|
40
43
|
InitRequest,
|
|
@@ -70,16 +73,10 @@ try:
|
|
|
70
73
|
except ImportError:
|
|
71
74
|
WeaveAdapter = None
|
|
72
75
|
|
|
73
|
-
try:
|
|
74
|
-
from .proxy import create_app, AuthProvider
|
|
75
|
-
except ImportError:
|
|
76
|
-
create_app = None
|
|
77
|
-
AuthProvider = None
|
|
78
|
-
|
|
79
|
-
|
|
80
76
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
81
77
|
|
|
82
78
|
__all__ = [
|
|
79
|
+
"ElasticsearchConfig",
|
|
83
80
|
"ElasticsearchDirectHttpHandler",
|
|
84
81
|
"RolloutIdFilter",
|
|
85
82
|
"setup_rollout_logging_for_elasticsearch_handler",
|
|
@@ -102,6 +99,7 @@ __all__ = [
|
|
|
102
99
|
"BraintrustAdapter",
|
|
103
100
|
"create_braintrust_adapter",
|
|
104
101
|
"LangSmithAdapter",
|
|
102
|
+
"FireworksTracingHttpHandler",
|
|
105
103
|
# Core interfaces
|
|
106
104
|
"Message",
|
|
107
105
|
"MetricResult",
|
|
@@ -137,9 +135,6 @@ __all__ = [
|
|
|
137
135
|
"RolloutMetadata",
|
|
138
136
|
"StatusResponse",
|
|
139
137
|
"create_langfuse_config_tags",
|
|
140
|
-
# Proxy
|
|
141
|
-
"create_app",
|
|
142
|
-
"AuthProvider",
|
|
143
138
|
]
|
|
144
139
|
|
|
145
140
|
from . import _version
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-10T13:52:05-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.46.dev3"
|
|
14
|
+
"full-revisionid": "8e5d3a5f347613eafe384a726d3598cf58719822",
|
|
15
|
+
"version": "0.2.48"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
|
+
import time
|
|
10
11
|
from datetime import datetime
|
|
11
12
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
-
import os
|
|
13
13
|
|
|
14
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
15
15
|
from .base import BaseAdapter
|
|
@@ -343,17 +343,15 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
343
343
|
# Remove None values
|
|
344
344
|
params = {k: v for k, v in params.items() if v is not None}
|
|
345
345
|
|
|
346
|
-
# Make request to proxy
|
|
346
|
+
# Make request to proxy
|
|
347
347
|
if self.project_id:
|
|
348
|
-
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces
|
|
348
|
+
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
|
|
349
349
|
else:
|
|
350
|
-
url = f"{self.base_url}/v1/traces
|
|
351
|
-
|
|
352
|
-
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
350
|
+
url = f"{self.base_url}/v1/traces"
|
|
353
351
|
|
|
354
352
|
result = None
|
|
355
353
|
try:
|
|
356
|
-
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
354
|
+
response = requests.get(url, params=params, timeout=self.timeout)
|
|
357
355
|
response.raise_for_status()
|
|
358
356
|
result = response.json()
|
|
359
357
|
except requests.exceptions.HTTPError as e:
|
|
@@ -367,7 +365,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
367
365
|
except Exception: # In case e.response.json() fails
|
|
368
366
|
error_msg = f"Proxy error: {e.response.text}"
|
|
369
367
|
|
|
370
|
-
logger.error("Failed to fetch traces from proxy
|
|
368
|
+
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
371
369
|
return eval_rows
|
|
372
370
|
except requests.exceptions.RequestException as e:
|
|
373
371
|
# Non-HTTP errors (network issues, timeouts, etc.)
|
{eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/adapters/openai_responses.py
RENAMED
|
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
169
169
|
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
|
|
170
170
|
elif item.type == "function_call_output":
|
|
171
171
|
# Collect tool call outputs to add before assistant message
|
|
172
|
-
tool_call_outputs.append(
|
|
172
|
+
tool_call_outputs.append(
|
|
173
|
+
Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
|
|
174
|
+
)
|
|
173
175
|
elif item.type == "function_call":
|
|
174
176
|
tool_call = ChatCompletionMessageToolCall(
|
|
175
177
|
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
|
|
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
186
188
|
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
|
|
187
189
|
|
|
188
190
|
return reversed(messages)
|
|
191
|
+
|
|
192
|
+
def _coerce_tool_output(self, output: Any) -> str:
|
|
193
|
+
"""Coerce OpenAI Responses tool output into a string for Message.content.
|
|
194
|
+
|
|
195
|
+
The Responses API may return structured content lists. For our purposes,
|
|
196
|
+
we stringify non-string outputs to satisfy the Message.content type.
|
|
197
|
+
"""
|
|
198
|
+
if isinstance(output, str):
|
|
199
|
+
return output
|
|
200
|
+
try:
|
|
201
|
+
# Attempt to join list of objects with any 'text' fields
|
|
202
|
+
if isinstance(output, list):
|
|
203
|
+
parts: list[str] = []
|
|
204
|
+
for part in output:
|
|
205
|
+
text = None
|
|
206
|
+
if isinstance(part, dict):
|
|
207
|
+
text = part.get("text")
|
|
208
|
+
if text:
|
|
209
|
+
parts.append(str(text))
|
|
210
|
+
else:
|
|
211
|
+
parts.append(str(part))
|
|
212
|
+
return "\n".join(parts)
|
|
213
|
+
# Fallback to string conversion
|
|
214
|
+
return str(output)
|
|
215
|
+
except Exception:
|
|
216
|
+
return str(output)
|
|
@@ -301,6 +301,12 @@ def parse_args(args=None):
|
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
303
|
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
304
|
+
logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
|
|
305
|
+
logs_parser.add_argument(
|
|
306
|
+
"--use-env-elasticsearch-config",
|
|
307
|
+
action="store_true",
|
|
308
|
+
help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
|
|
309
|
+
)
|
|
304
310
|
|
|
305
311
|
# Upload command
|
|
306
312
|
upload_parser = subparsers.add_parser(
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for serving logs with file watching and real-time updates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..utils.logs_server import serve_logs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def logs_command(args):
|
|
12
|
+
"""Serve logs with file watching and real-time updates"""
|
|
13
|
+
|
|
14
|
+
port = args.port
|
|
15
|
+
print("🚀 Starting Eval Protocol Logs Server")
|
|
16
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
17
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
18
|
+
print(f"👀 Watching paths: {['current directory']}")
|
|
19
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
20
|
+
print("Press Ctrl+C to stop the server")
|
|
21
|
+
print("-" * 50)
|
|
22
|
+
|
|
23
|
+
# Setup Elasticsearch based on flags
|
|
24
|
+
elasticsearch_config = None
|
|
25
|
+
try:
|
|
26
|
+
if getattr(args, "use_env_elasticsearch_config", False):
|
|
27
|
+
# Use environment variables for configuration
|
|
28
|
+
print("⚙️ Using environment variables for Elasticsearch config")
|
|
29
|
+
from eval_protocol.pytest.remote_rollout_processor import (
|
|
30
|
+
create_elasticsearch_config_from_env,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
elasticsearch_config = create_elasticsearch_config_from_env()
|
|
34
|
+
elif not getattr(args, "disable_elasticsearch_setup", False):
|
|
35
|
+
# Default behavior: start or connect to local Elasticsearch via Docker helper
|
|
36
|
+
from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
|
|
37
|
+
|
|
38
|
+
print("🧰 Auto-configuring local Elasticsearch (Docker)")
|
|
39
|
+
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
40
|
+
else:
|
|
41
|
+
print("🚫 Elasticsearch setup disabled; running without Elasticsearch integration")
|
|
42
|
+
except Exception as e:
|
|
43
|
+
print(f"❌ Failed to configure Elasticsearch: {e}")
|
|
44
|
+
return 1
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
|
|
48
|
+
return 0
|
|
49
|
+
except KeyboardInterrupt:
|
|
50
|
+
print("\n🛑 Server stopped by user")
|
|
51
|
+
return 0
|
|
52
|
+
except Exception as e:
|
|
53
|
+
print(f"❌ Error starting server: {e}")
|
|
54
|
+
return 1
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
import threading
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Optional, Any, Dict, List, cast
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FireworksTracingHttpHandler(logging.Handler):
|
|
11
|
+
"""Logging handler that posts structured logs to tracing.fireworks gateway /logs endpoint."""
|
|
12
|
+
|
|
13
|
+
def __init__(self, gateway_base_url: Optional[str] = None, rollout_id_env: str = "EP_ROLLOUT_ID") -> None:
|
|
14
|
+
super().__init__()
|
|
15
|
+
self.gateway_base_url = gateway_base_url or os.getenv("FW_TRACING_GATEWAY_BASE_URL")
|
|
16
|
+
self.rollout_id_env = rollout_id_env
|
|
17
|
+
self._session = requests.Session()
|
|
18
|
+
self._lock = threading.Lock()
|
|
19
|
+
|
|
20
|
+
def emit(self, record: logging.LogRecord) -> None:
|
|
21
|
+
try:
|
|
22
|
+
if not self.gateway_base_url:
|
|
23
|
+
return
|
|
24
|
+
rollout_id = self._get_rollout_id(record)
|
|
25
|
+
if not rollout_id:
|
|
26
|
+
return
|
|
27
|
+
payload = self._build_payload(record, rollout_id)
|
|
28
|
+
url = f"{self.gateway_base_url.rstrip('/')}/logs"
|
|
29
|
+
with self._lock:
|
|
30
|
+
self._session.post(url, json=payload, timeout=5)
|
|
31
|
+
except Exception:
|
|
32
|
+
# Avoid raising exceptions from logging
|
|
33
|
+
self.handleError(record)
|
|
34
|
+
|
|
35
|
+
def _get_rollout_id(self, record: logging.LogRecord) -> Optional[str]:
|
|
36
|
+
if hasattr(record, "rollout_id") and cast(Any, getattr(record, "rollout_id")) is not None:
|
|
37
|
+
return str(cast(Any, getattr(record, "rollout_id")))
|
|
38
|
+
return os.getenv(self.rollout_id_env)
|
|
39
|
+
|
|
40
|
+
def _build_payload(self, record: logging.LogRecord, rollout_id: str) -> Dict[str, Any]:
|
|
41
|
+
timestamp = datetime.fromtimestamp(record.created, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
|
|
42
|
+
message = record.getMessage()
|
|
43
|
+
tags: List[str] = [f"rollout_id:{rollout_id}"]
|
|
44
|
+
# Optional additional tags
|
|
45
|
+
if hasattr(record, "experiment_id") and cast(Any, getattr(record, "experiment_id")):
|
|
46
|
+
tags.append(f"experiment_id:{cast(Any, getattr(record, 'experiment_id'))}")
|
|
47
|
+
if hasattr(record, "run_id") and cast(Any, getattr(record, "run_id")):
|
|
48
|
+
tags.append(f"run_id:{cast(Any, getattr(record, 'run_id'))}")
|
|
49
|
+
program = cast(Optional[str], getattr(record, "program", None)) or "eval_protocol"
|
|
50
|
+
status_val = cast(Any, getattr(record, "status", None))
|
|
51
|
+
status = status_val if isinstance(status_val, str) else None
|
|
52
|
+
return {
|
|
53
|
+
"program": program,
|
|
54
|
+
"status": status,
|
|
55
|
+
"message": message,
|
|
56
|
+
"tags": tags,
|
|
57
|
+
"metadata": cast(Any, getattr(record, "metadata", None)),
|
|
58
|
+
"extras": {
|
|
59
|
+
"logger_name": record.name,
|
|
60
|
+
"level": record.levelname,
|
|
61
|
+
"timestamp": timestamp,
|
|
62
|
+
},
|
|
63
|
+
}
|
|
@@ -62,7 +62,8 @@ from eval_protocol.pytest.utils import (
|
|
|
62
62
|
run_tasks_with_eval_progress,
|
|
63
63
|
run_tasks_with_run_progress,
|
|
64
64
|
)
|
|
65
|
-
from eval_protocol.utils.show_results_url import store_local_ui_results_url
|
|
65
|
+
from eval_protocol.utils.show_results_url import store_local_ui_results_url, generate_invocation_filter_url
|
|
66
|
+
from eval_protocol.utils.browser_utils import is_logs_server_running, open_browser_tab
|
|
66
67
|
|
|
67
68
|
from ..common_utils import load_jsonl
|
|
68
69
|
|
|
@@ -80,6 +81,7 @@ def evaluation_test(
|
|
|
80
81
|
rollout_processor_kwargs: RolloutProcessorInputParam | None = None,
|
|
81
82
|
aggregation_method: AggregationMethod = "mean",
|
|
82
83
|
passed_threshold: EvaluationThreshold | float | EvaluationThresholdDict | None = None,
|
|
84
|
+
disable_browser_open: bool = False,
|
|
83
85
|
num_runs: int = 1,
|
|
84
86
|
filtered_row_ids: Sequence[str] | None = None,
|
|
85
87
|
max_dataset_rows: int | None = None,
|
|
@@ -246,10 +248,29 @@ def evaluation_test(
|
|
|
246
248
|
else:
|
|
247
249
|
invocation_id = generate_id()
|
|
248
250
|
|
|
251
|
+
# Track whether we've opened browser for this invocation
|
|
252
|
+
browser_opened_for_invocation = False
|
|
253
|
+
|
|
249
254
|
async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
|
|
255
|
+
nonlocal browser_opened_for_invocation
|
|
256
|
+
|
|
250
257
|
# Store URL for viewing results (after all postprocessing is complete)
|
|
251
258
|
store_local_ui_results_url(invocation_id)
|
|
252
259
|
|
|
260
|
+
# Auto-open browser if server is running and not disabled (only once per invocation)
|
|
261
|
+
if (
|
|
262
|
+
not browser_opened_for_invocation
|
|
263
|
+
and not disable_browser_open
|
|
264
|
+
and os.environ.get("EP_DISABLE_AUTO_BROWSER") is None
|
|
265
|
+
):
|
|
266
|
+
is_running, port = is_logs_server_running()
|
|
267
|
+
if is_running:
|
|
268
|
+
# Generate URL for table view with invocation filter
|
|
269
|
+
base_url = f"http://localhost:{port}" if port else "http://localhost:8000"
|
|
270
|
+
table_url = generate_invocation_filter_url(invocation_id, f"{base_url}/table")
|
|
271
|
+
open_browser_tab(table_url)
|
|
272
|
+
browser_opened_for_invocation = True
|
|
273
|
+
|
|
253
274
|
eval_metadata = None
|
|
254
275
|
|
|
255
276
|
all_results: list[list[EvaluationRow]] = [[] for _ in range(num_runs)]
|
{eval_protocol-0.2.46.dev3 → eval_protocol-0.2.48}/eval_protocol/pytest/remote_rollout_processor.py
RENAMED
|
@@ -26,6 +26,25 @@ import os
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
def create_elasticsearch_config_from_env() -> ElasticsearchConfig:
|
|
30
|
+
"""Setup Elasticsearch config from environment variables."""
|
|
31
|
+
url = os.getenv("ELASTICSEARCH_URL")
|
|
32
|
+
api_key = os.getenv("ELASTICSEARCH_API_KEY")
|
|
33
|
+
index_name = os.getenv("ELASTICSEARCH_INDEX_NAME")
|
|
34
|
+
|
|
35
|
+
if url is None:
|
|
36
|
+
raise ValueError("ELASTICSEARCH_URL must be set")
|
|
37
|
+
if api_key is None:
|
|
38
|
+
raise ValueError("ELASTICSEARCH_API_KEY must be set")
|
|
39
|
+
if index_name is None:
|
|
40
|
+
raise ValueError("ELASTICSEARCH_INDEX_NAME must be set")
|
|
41
|
+
return ElasticsearchConfig(
|
|
42
|
+
url=url,
|
|
43
|
+
api_key=api_key,
|
|
44
|
+
index_name=index_name,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
|
|
29
48
|
def _build_fireworks_tracing_url(
|
|
30
49
|
base_url: str, metadata: RolloutMetadata, completion_params_base_url: Optional[str] = None
|
|
31
50
|
) -> str:
|
|
@@ -93,7 +112,7 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
93
112
|
poll_interval: float = 1.0,
|
|
94
113
|
timeout_seconds: float = 120.0,
|
|
95
114
|
output_data_loader: Optional[Callable[[DataLoaderConfig], DynamicDataLoader]] = None,
|
|
96
|
-
|
|
115
|
+
disable_elastic_search_setup: bool = False,
|
|
97
116
|
elastic_search_config: Optional[ElasticsearchConfig] = None,
|
|
98
117
|
):
|
|
99
118
|
# Prefer constructor-provided configuration. These can be overridden via
|
|
@@ -108,11 +127,11 @@ class RemoteRolloutProcessor(RolloutProcessor):
|
|
|
108
127
|
self._poll_interval = poll_interval
|
|
109
128
|
self._timeout_seconds = timeout_seconds
|
|
110
129
|
self._output_data_loader = output_data_loader or _default_output_data_loader
|
|
111
|
-
self.
|
|
130
|
+
self._disable_elastic_search_setup = disable_elastic_search_setup
|
|
112
131
|
self._elastic_search_config = elastic_search_config
|
|
113
132
|
|
|
114
133
|
def setup(self) -> None:
|
|
115
|
-
if self.
|
|
134
|
+
if self._disable_elastic_search_setup:
|
|
116
135
|
logger.info("Elasticsearch is disabled, skipping setup")
|
|
117
136
|
return
|
|
118
137
|
logger.info("Setting up Elasticsearch")
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser utilities for auto-opening evaluation results in the local UI.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
import webbrowser
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Tuple, Optional
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import psutil
|
|
15
|
+
|
|
16
|
+
PSUTIL_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
PSUTIL_AVAILABLE = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_pid_file_path() -> Path:
|
|
22
|
+
"""Get the path to the logs server PID file."""
|
|
23
|
+
from eval_protocol.directory_utils import find_eval_protocol_dir
|
|
24
|
+
|
|
25
|
+
return Path(find_eval_protocol_dir()) / "logs_server.pid"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def write_pid_file(pid: int, port: int) -> None:
|
|
29
|
+
"""
|
|
30
|
+
Write the server PID and port to a file for external processes to check.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
pid: The process ID of the logs server
|
|
34
|
+
port: The port the server is running on
|
|
35
|
+
"""
|
|
36
|
+
try:
|
|
37
|
+
pid_file = _get_pid_file_path()
|
|
38
|
+
|
|
39
|
+
data = {"pid": pid, "port": port}
|
|
40
|
+
|
|
41
|
+
with open(pid_file, "w") as f:
|
|
42
|
+
json.dump(data, f)
|
|
43
|
+
|
|
44
|
+
# Use print instead of logger to avoid circular imports
|
|
45
|
+
print(f"Wrote PID file: {pid_file} with PID {pid} and port {port}")
|
|
46
|
+
except Exception as e:
|
|
47
|
+
print(f"Warning: Failed to write PID file: {e}")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_logs_server_running() -> Tuple[bool, Optional[int]]:
|
|
51
|
+
"""
|
|
52
|
+
Check if the logs server is running by reading the PID file and verifying the process.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
Tuple of (is_running, port) where:
|
|
56
|
+
- is_running: True if server is running, False otherwise
|
|
57
|
+
- port: The port the server is running on, or None if not running
|
|
58
|
+
"""
|
|
59
|
+
if not PSUTIL_AVAILABLE:
|
|
60
|
+
return False, None
|
|
61
|
+
|
|
62
|
+
pid_file = _get_pid_file_path()
|
|
63
|
+
if not pid_file.exists():
|
|
64
|
+
return False, None
|
|
65
|
+
|
|
66
|
+
try:
|
|
67
|
+
with open(pid_file, "r") as f:
|
|
68
|
+
data = json.load(f)
|
|
69
|
+
pid = data.get("pid")
|
|
70
|
+
port = data.get("port")
|
|
71
|
+
except (json.JSONDecodeError, KeyError, FileNotFoundError):
|
|
72
|
+
return False, None
|
|
73
|
+
|
|
74
|
+
if pid is None:
|
|
75
|
+
return False, None
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
# Check if the process is still running
|
|
79
|
+
process = psutil.Process(pid)
|
|
80
|
+
if not process.is_running():
|
|
81
|
+
return False, None
|
|
82
|
+
|
|
83
|
+
# Optionally verify it's listening on the expected port
|
|
84
|
+
if port is not None:
|
|
85
|
+
try:
|
|
86
|
+
connections = process.net_connections()
|
|
87
|
+
for conn in connections:
|
|
88
|
+
if conn.laddr.port == port and conn.status == "LISTEN":
|
|
89
|
+
return True, port
|
|
90
|
+
except (psutil.AccessDenied, psutil.NoSuchProcess):
|
|
91
|
+
# If we can't check connections, assume it's running if process exists
|
|
92
|
+
pass
|
|
93
|
+
|
|
94
|
+
return True, port
|
|
95
|
+
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
|
96
|
+
return False, None
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def open_browser_tab(url: str, delay: float = 0.5) -> None:
|
|
100
|
+
"""
|
|
101
|
+
Open a URL in a new browser tab with an optional delay.
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
url: The URL to open
|
|
105
|
+
delay: Delay in seconds before opening browser (default: 0.5)
|
|
106
|
+
"""
|
|
107
|
+
|
|
108
|
+
def _open():
|
|
109
|
+
time.sleep(delay) # Give the server time to start
|
|
110
|
+
webbrowser.open_new_tab(url)
|
|
111
|
+
|
|
112
|
+
thread = threading.Thread(target=_open)
|
|
113
|
+
thread.daemon = True
|
|
114
|
+
thread.start()
|
|
@@ -6,6 +6,7 @@ import threading
|
|
|
6
6
|
import time
|
|
7
7
|
from datetime import datetime
|
|
8
8
|
from contextlib import asynccontextmanager
|
|
9
|
+
from pathlib import Path
|
|
9
10
|
from queue import Queue
|
|
10
11
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
|
11
12
|
|
|
@@ -23,6 +24,7 @@ from eval_protocol.utils.vite_server import ViteServer
|
|
|
23
24
|
from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient
|
|
24
25
|
from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig
|
|
25
26
|
from eval_protocol.utils.logs_models import LogEntry, LogsResponse
|
|
27
|
+
from eval_protocol.utils.browser_utils import write_pid_file
|
|
26
28
|
|
|
27
29
|
if TYPE_CHECKING:
|
|
28
30
|
from eval_protocol.models import EvaluationRow
|
|
@@ -378,7 +380,7 @@ class LogsServer(ViteServer):
|
|
|
378
380
|
event_bus.subscribe(self._handle_event)
|
|
379
381
|
logger.debug("[LOGS_SERVER_INIT] Successfully subscribed to event bus")
|
|
380
382
|
|
|
381
|
-
logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {host}:{port}")
|
|
383
|
+
logger.info(f"[LOGS_SERVER_INIT] LogsServer initialized on {self.host}:{self.port}")
|
|
382
384
|
|
|
383
385
|
def _setup_websocket_routes(self):
|
|
384
386
|
"""Set up WebSocket routes for real-time communication."""
|
|
@@ -541,6 +543,12 @@ class LogsServer(ViteServer):
|
|
|
541
543
|
)
|
|
542
544
|
|
|
543
545
|
server = uvicorn.Server(config)
|
|
546
|
+
|
|
547
|
+
# Write PID file after server is configured but before serving
|
|
548
|
+
logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Writing PID file for port {self.port}")
|
|
549
|
+
write_pid_file(os.getpid(), self.port)
|
|
550
|
+
logger.debug(f"[LOGS_SERVER_RUN_ASYNC] Successfully wrote PID file for port {self.port}")
|
|
551
|
+
|
|
544
552
|
await server.serve()
|
|
545
553
|
|
|
546
554
|
except KeyboardInterrupt:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.46.dev3
|
|
3
|
+
Version: 0.2.48
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -40,9 +40,9 @@ eval_protocol.egg-info/requires.txt
|
|
|
40
40
|
eval_protocol.egg-info/top_level.txt
|
|
41
41
|
eval_protocol/../vite-app/dist/index.html
|
|
42
42
|
eval_protocol/../vite-app/dist/assets/favicon-BkAAWQga.png
|
|
43
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
44
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
45
|
-
eval_protocol/../vite-app/dist/assets/index-
|
|
43
|
+
eval_protocol/../vite-app/dist/assets/index-34WaHH5W.css
|
|
44
|
+
eval_protocol/../vite-app/dist/assets/index-DOPsfOMT.js
|
|
45
|
+
eval_protocol/../vite-app/dist/assets/index-DOPsfOMT.js.map
|
|
46
46
|
eval_protocol/../vite-app/dist/assets/logo-light-BprIBJQW.png
|
|
47
47
|
eval_protocol/adapters/__init__.py
|
|
48
48
|
eval_protocol/adapters/base.py
|
|
@@ -124,6 +124,7 @@ eval_protocol/log_utils/__init__.py
|
|
|
124
124
|
eval_protocol/log_utils/elasticsearch_client.py
|
|
125
125
|
eval_protocol/log_utils/elasticsearch_direct_http_handler.py
|
|
126
126
|
eval_protocol/log_utils/elasticsearch_index_manager.py
|
|
127
|
+
eval_protocol/log_utils/fireworks_tracing_http_handler.py
|
|
127
128
|
eval_protocol/log_utils/rollout_id_filter.py
|
|
128
129
|
eval_protocol/log_utils/util.py
|
|
129
130
|
eval_protocol/mcp/__init__.py
|
|
@@ -165,15 +166,6 @@ eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
|
|
|
165
166
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md
|
|
166
167
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md
|
|
167
168
|
eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md
|
|
168
|
-
eval_protocol/proxy/__init__.py
|
|
169
|
-
eval_protocol/proxy/proxy_core/__init__.py
|
|
170
|
-
eval_protocol/proxy/proxy_core/app.py
|
|
171
|
-
eval_protocol/proxy/proxy_core/auth.py
|
|
172
|
-
eval_protocol/proxy/proxy_core/langfuse.py
|
|
173
|
-
eval_protocol/proxy/proxy_core/litellm.py
|
|
174
|
-
eval_protocol/proxy/proxy_core/main.py
|
|
175
|
-
eval_protocol/proxy/proxy_core/models.py
|
|
176
|
-
eval_protocol/proxy/proxy_core/redis_utils.py
|
|
177
169
|
eval_protocol/pytest/__init__.py
|
|
178
170
|
eval_protocol/pytest/default_agent_rollout_processor.py
|
|
179
171
|
eval_protocol/pytest/default_dataset_adapter.py
|
|
@@ -238,6 +230,7 @@ eval_protocol/types/types.py
|
|
|
238
230
|
eval_protocol/utils/__init__.py
|
|
239
231
|
eval_protocol/utils/batch_evaluation.py
|
|
240
232
|
eval_protocol/utils/batch_transformation.py
|
|
233
|
+
eval_protocol/utils/browser_utils.py
|
|
241
234
|
eval_protocol/utils/check_server_status.py
|
|
242
235
|
eval_protocol/utils/dataset_helpers.py
|
|
243
236
|
eval_protocol/utils/logs_models.py
|