eval-protocol 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.55.dev1/eval_protocol.egg-info → eval_protocol-0.2.57.dev2}/PKG-INFO +1 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/__init__.py +14 -2
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py +49 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/openai_responses.py +29 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/auth.py +39 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli.py +16 -0
- eval_protocol-0.2.57.dev2/eval_protocol/cli_commands/logs.py +57 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/upload.py +40 -71
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/evaluation.py +125 -40
- eval_protocol-0.2.57.dev2/eval_protocol/event_bus/__init__.py +25 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +11 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/fireworks_tracing_http_handler.py +138 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/init.py +69 -0
- eval_protocol-0.2.57.dev2/eval_protocol/log_utils/rollout_context.py +84 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/policy.py +18 -6
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/models.py +3 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/redis_utils.py +11 -2
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/__init__.py +2 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +19 -6
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_single_turn_rollout_process.py +23 -11
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test.py +71 -16
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/evaluation_test_postprocess.py +6 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/generate_parameter_combinations.py +1 -1
- eval_protocol-0.2.57.dev2/eval_protocol/pytest/github_action_rollout_processor.py +223 -0
- eval_protocol-0.2.57.dev2/eval_protocol/pytest/remote_rollout_processor.py +207 -0
- eval_protocol-0.2.57.dev2/eval_protocol/pytest/tracing_utils.py +165 -0
- eval_protocol-0.2.57.dev2/eval_protocol/quickstart/__init__.py +8 -0
- eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/__init__.py +4 -0
- eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/llm_judge.py +90 -0
- eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +63 -0
- {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_langfuse.py +1 -3
- eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge/utils.py +133 -0
- eval_protocol-0.2.57.dev2/eval_protocol/utils/browser_utils.py +114 -0
- eval_protocol-0.2.57.dev2/eval_protocol/utils/evaluation_row_utils.py +136 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_server.py +87 -6
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/SOURCES.txt +18 -7
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_postprocess.py +1 -2
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_quickstart_utils.py +1 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_show_results_url.py +141 -0
- eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-BnDJont9.css +1 -0
- eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-Cu9t0G5i.js +137 -0
- eval_protocol-0.2.57.dev2/vite-app/dist/assets/index-Cu9t0G5i.js.map +1 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.55.dev1/eval_protocol/cli_commands/logs.py +0 -36
- eval_protocol-0.2.55.dev1/eval_protocol/event_bus/__init__.py +0 -5
- eval_protocol-0.2.55.dev1/eval_protocol/pytest/remote_rollout_processor.py +0 -364
- eval_protocol-0.2.55.dev1/eval_protocol/quickstart/__init__.py +0 -4
- eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-C81y9r9l.js +0 -136
- eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-C81y9r9l.js.map +0 -1
- eval_protocol-0.2.55.dev1/vite-app/dist/assets/index-DpYZaoAr.css +0 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/LICENSE +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/README.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- eval_protocol-0.2.55.dev1/eval_protocol/pytest/utils.py → eval_protocol-0.2.57.dev2/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_langsmith.py +1 -1
- {eval_protocol-0.2.55.dev1/eval_protocol/quickstart → eval_protocol-0.2.57.dev2/eval_protocol/quickstart/aha_judge}/llm_judge_openai_responses.py +1 -1
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/pyproject.toml +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/setup.cfg +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/setup.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_format.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_length.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_math.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_models.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/versioneer.py +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.57.dev2
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -29,12 +29,20 @@ from .playback_policy import PlaybackPolicyBase
|
|
|
29
29
|
from .resources import create_llm_resource
|
|
30
30
|
from .reward_function import RewardFunction
|
|
31
31
|
from .typed_interface import reward_function
|
|
32
|
-
from .quickstart import aha_judge
|
|
33
|
-
from .
|
|
32
|
+
from .quickstart.aha_judge import aha_judge
|
|
33
|
+
from .utils.evaluation_row_utils import (
|
|
34
|
+
multi_turn_assistant_to_ground_truth,
|
|
35
|
+
assistant_to_ground_truth,
|
|
36
|
+
filter_longest_conversation,
|
|
37
|
+
)
|
|
38
|
+
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
|
|
34
39
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
35
40
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
36
41
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
37
42
|
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
43
|
+
from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
|
|
44
|
+
from .log_utils.elasticsearch_client import ElasticsearchConfig
|
|
45
|
+
|
|
38
46
|
|
|
39
47
|
from .types.remote_rollout_processor import (
|
|
40
48
|
InitRequest,
|
|
@@ -81,12 +89,14 @@ except ImportError:
|
|
|
81
89
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
82
90
|
|
|
83
91
|
__all__ = [
|
|
92
|
+
"ElasticsearchConfig",
|
|
84
93
|
"ElasticsearchDirectHttpHandler",
|
|
85
94
|
"RolloutIdFilter",
|
|
86
95
|
"setup_rollout_logging_for_elasticsearch_handler",
|
|
87
96
|
"DataLoaderConfig",
|
|
88
97
|
"Status",
|
|
89
98
|
"RemoteRolloutProcessor",
|
|
99
|
+
"GithubActionRolloutProcessor",
|
|
90
100
|
"InputMetadata",
|
|
91
101
|
"EvaluationRow",
|
|
92
102
|
"DefaultParameterIdGenerator",
|
|
@@ -95,6 +105,7 @@ __all__ = [
|
|
|
95
105
|
"aha_judge",
|
|
96
106
|
"multi_turn_assistant_to_ground_truth",
|
|
97
107
|
"assistant_to_ground_truth",
|
|
108
|
+
"filter_longest_conversation",
|
|
98
109
|
"evaluation_test",
|
|
99
110
|
"SingleTurnRolloutProcessor",
|
|
100
111
|
"OpenAIResponsesAdapter",
|
|
@@ -103,6 +114,7 @@ __all__ = [
|
|
|
103
114
|
"BraintrustAdapter",
|
|
104
115
|
"create_braintrust_adapter",
|
|
105
116
|
"LangSmithAdapter",
|
|
117
|
+
"FireworksTracingHttpHandler",
|
|
106
118
|
# Core interfaces
|
|
107
119
|
"Message",
|
|
108
120
|
"MetricResult",
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-21T14:44:45-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "5a0eb89e557f1362bc17acd8a02c25a072dc3092",
|
|
15
|
+
"version": "0.2.57-dev2"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -265,6 +265,55 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
265
265
|
self.base_url = base_url.rstrip("/")
|
|
266
266
|
self.timeout = timeout
|
|
267
267
|
|
|
268
|
+
def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
|
|
269
|
+
"""Fetch logs from Fireworks tracing gateway /logs endpoint.
|
|
270
|
+
|
|
271
|
+
Returns entries with keys: timestamp, message, severity, tags.
|
|
272
|
+
"""
|
|
273
|
+
if not tags:
|
|
274
|
+
raise ValueError("At least one tag is required to fetch logs")
|
|
275
|
+
|
|
276
|
+
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
277
|
+
params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
|
|
278
|
+
|
|
279
|
+
# Try /logs first, fall back to /v1/logs if not found
|
|
280
|
+
urls_to_try = [f"{self.base_url}/logs", f"{self.base_url}/v1/logs"]
|
|
281
|
+
data: Dict[str, Any] = {}
|
|
282
|
+
last_error: Optional[str] = None
|
|
283
|
+
for url in urls_to_try:
|
|
284
|
+
try:
|
|
285
|
+
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
286
|
+
if response.status_code == 404:
|
|
287
|
+
# Try next variant
|
|
288
|
+
last_error = f"404 for {url}"
|
|
289
|
+
continue
|
|
290
|
+
response.raise_for_status()
|
|
291
|
+
data = response.json() or {}
|
|
292
|
+
break
|
|
293
|
+
except requests.exceptions.RequestException as e:
|
|
294
|
+
last_error = str(e)
|
|
295
|
+
continue
|
|
296
|
+
else:
|
|
297
|
+
# All attempts failed
|
|
298
|
+
if last_error:
|
|
299
|
+
logger.error("Failed to fetch logs from Fireworks (tried %s): %s", urls_to_try, last_error)
|
|
300
|
+
return []
|
|
301
|
+
|
|
302
|
+
entries: List[Dict[str, Any]] = data.get("entries", []) or []
|
|
303
|
+
# Normalize minimal shape
|
|
304
|
+
results: List[Dict[str, Any]] = []
|
|
305
|
+
for e in entries:
|
|
306
|
+
results.append(
|
|
307
|
+
{
|
|
308
|
+
"timestamp": e.get("timestamp"),
|
|
309
|
+
"message": e.get("message"),
|
|
310
|
+
"severity": e.get("severity", "INFO"),
|
|
311
|
+
"tags": e.get("tags", []),
|
|
312
|
+
"status": e.get("status"),
|
|
313
|
+
}
|
|
314
|
+
)
|
|
315
|
+
return results
|
|
316
|
+
|
|
268
317
|
def get_evaluation_rows(
|
|
269
318
|
self,
|
|
270
319
|
tags: List[str],
|
{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/openai_responses.py
RENAMED
|
@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
169
169
|
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
|
|
170
170
|
elif item.type == "function_call_output":
|
|
171
171
|
# Collect tool call outputs to add before assistant message
|
|
172
|
-
tool_call_outputs.append(
|
|
172
|
+
tool_call_outputs.append(
|
|
173
|
+
Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
|
|
174
|
+
)
|
|
173
175
|
elif item.type == "function_call":
|
|
174
176
|
tool_call = ChatCompletionMessageToolCall(
|
|
175
177
|
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
|
|
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
186
188
|
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
|
|
187
189
|
|
|
188
190
|
return reversed(messages)
|
|
191
|
+
|
|
192
|
+
def _coerce_tool_output(self, output: Any) -> str:
|
|
193
|
+
"""Coerce OpenAI Responses tool output into a string for Message.content.
|
|
194
|
+
|
|
195
|
+
The Responses API may return structured content lists. For our purposes,
|
|
196
|
+
we stringify non-string outputs to satisfy the Message.content type.
|
|
197
|
+
"""
|
|
198
|
+
if isinstance(output, str):
|
|
199
|
+
return output
|
|
200
|
+
try:
|
|
201
|
+
# Attempt to join list of objects with any 'text' fields
|
|
202
|
+
if isinstance(output, list):
|
|
203
|
+
parts: list[str] = []
|
|
204
|
+
for part in output:
|
|
205
|
+
text = None
|
|
206
|
+
if isinstance(part, dict):
|
|
207
|
+
text = part.get("text")
|
|
208
|
+
if text:
|
|
209
|
+
parts.append(str(text))
|
|
210
|
+
else:
|
|
211
|
+
parts.append(str(part))
|
|
212
|
+
return "\n".join(parts)
|
|
213
|
+
# Fallback to string conversion
|
|
214
|
+
return str(output)
|
|
215
|
+
except Exception:
|
|
216
|
+
return str(output)
|
|
@@ -4,6 +4,8 @@ import os
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Dict, Optional # Added Dict
|
|
6
6
|
|
|
7
|
+
import requests
|
|
8
|
+
|
|
7
9
|
logger = logging.getLogger(__name__)
|
|
8
10
|
|
|
9
11
|
# Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
|
|
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
|
|
|
218
220
|
else:
|
|
219
221
|
logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
|
|
220
222
|
return api_base
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def verify_api_key_and_get_account_id(
|
|
226
|
+
api_key: Optional[str] = None,
|
|
227
|
+
api_base: Optional[str] = None,
|
|
228
|
+
) -> Optional[str]:
|
|
229
|
+
"""
|
|
230
|
+
Calls the Fireworks API verify endpoint to validate the API key and returns the
|
|
231
|
+
account id from response headers when available.
|
|
232
|
+
|
|
233
|
+
Args:
|
|
234
|
+
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
|
|
235
|
+
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
The resolved account id if verification succeeds and the header is present; otherwise None.
|
|
239
|
+
"""
|
|
240
|
+
try:
|
|
241
|
+
resolved_key = api_key or get_fireworks_api_key()
|
|
242
|
+
if not resolved_key:
|
|
243
|
+
return None
|
|
244
|
+
resolved_base = api_base or get_fireworks_api_base()
|
|
245
|
+
url = f"{resolved_base.rstrip('/')}/verifyApiKey"
|
|
246
|
+
headers = {"Authorization": f"Bearer {resolved_key}"}
|
|
247
|
+
resp = requests.get(url, headers=headers, timeout=10)
|
|
248
|
+
if resp.status_code != 200:
|
|
249
|
+
logger.debug("verifyApiKey returned status %s", resp.status_code)
|
|
250
|
+
return None
|
|
251
|
+
# Header keys could vary in case; requests provides case-insensitive dict
|
|
252
|
+
account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
|
|
253
|
+
if account_id and account_id.strip():
|
|
254
|
+
logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
|
|
255
|
+
return account_id.strip()
|
|
256
|
+
return None
|
|
257
|
+
except Exception as e:
|
|
258
|
+
logger.debug("Failed to verify API key for account id resolution: %s", e)
|
|
259
|
+
return None
|
|
@@ -301,6 +301,22 @@ def parse_args(args=None):
|
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
303
|
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
304
|
+
logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
|
|
305
|
+
logs_parser.add_argument(
|
|
306
|
+
"--use-env-elasticsearch-config",
|
|
307
|
+
action="store_true",
|
|
308
|
+
help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
|
|
309
|
+
)
|
|
310
|
+
logs_parser.add_argument(
|
|
311
|
+
"--use-fireworks",
|
|
312
|
+
action="store_true",
|
|
313
|
+
help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)",
|
|
314
|
+
)
|
|
315
|
+
logs_parser.add_argument(
|
|
316
|
+
"--use-elasticsearch",
|
|
317
|
+
action="store_true",
|
|
318
|
+
help="Force Elasticsearch backend for logs UI (overrides env auto-detection)",
|
|
319
|
+
)
|
|
304
320
|
|
|
305
321
|
# Upload command
|
|
306
322
|
upload_parser = subparsers.add_parser(
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for serving logs with file watching and real-time updates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from ..utils.logs_server import serve_logs
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def logs_command(args):
|
|
13
|
+
"""Serve logs with file watching and real-time updates"""
|
|
14
|
+
|
|
15
|
+
port = args.port
|
|
16
|
+
print("🚀 Starting Eval Protocol Logs Server")
|
|
17
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
18
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
19
|
+
print(f"👀 Watching paths: {['current directory']}")
|
|
20
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
21
|
+
print("Press Ctrl+C to stop the server")
|
|
22
|
+
print("-" * 50)
|
|
23
|
+
|
|
24
|
+
# Backend selection: Fireworks first when API key present, unless overridden
|
|
25
|
+
use_fireworks = False
|
|
26
|
+
if getattr(args, "use_fireworks", False):
|
|
27
|
+
use_fireworks = True
|
|
28
|
+
elif getattr(args, "use_elasticsearch", False):
|
|
29
|
+
use_fireworks = False
|
|
30
|
+
else:
|
|
31
|
+
use_fireworks = bool(os.environ.get("FIREWORKS_API_KEY"))
|
|
32
|
+
|
|
33
|
+
# Setup backend configs
|
|
34
|
+
elasticsearch_config = None
|
|
35
|
+
# Prefer explicit FW_TRACING_GATEWAY_BASE_URL, then GATEWAY_URL from env (remote validation),
|
|
36
|
+
# finally default to public tracing.fireworks.ai
|
|
37
|
+
fireworks_base_url = (
|
|
38
|
+
os.environ.get("FW_TRACING_GATEWAY_BASE_URL")
|
|
39
|
+
or os.environ.get("GATEWAY_URL")
|
|
40
|
+
or "https://tracing.fireworks.ai"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
try:
|
|
44
|
+
serve_logs(
|
|
45
|
+
port=args.port,
|
|
46
|
+
elasticsearch_config=elasticsearch_config,
|
|
47
|
+
debug=args.debug,
|
|
48
|
+
backend="fireworks" if use_fireworks else "elasticsearch",
|
|
49
|
+
fireworks_base_url=fireworks_base_url if use_fireworks else None,
|
|
50
|
+
)
|
|
51
|
+
return 0
|
|
52
|
+
except KeyboardInterrupt:
|
|
53
|
+
print("\n🛑 Server stopped by user")
|
|
54
|
+
return 0
|
|
55
|
+
except Exception as e:
|
|
56
|
+
print(f"❌ Error starting server: {e}")
|
|
57
|
+
return 1
|
{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/upload.py
RENAMED
|
@@ -12,7 +12,12 @@ from pathlib import Path
|
|
|
12
12
|
from typing import Any, Callable, Iterable, Optional
|
|
13
13
|
|
|
14
14
|
import pytest
|
|
15
|
-
from eval_protocol.auth import
|
|
15
|
+
from eval_protocol.auth import (
|
|
16
|
+
get_fireworks_account_id,
|
|
17
|
+
get_fireworks_api_key,
|
|
18
|
+
get_fireworks_api_base,
|
|
19
|
+
verify_api_key_and_get_account_id,
|
|
20
|
+
)
|
|
16
21
|
from eval_protocol.platform_api import create_or_update_fireworks_secret
|
|
17
22
|
|
|
18
23
|
from eval_protocol.evaluation import create_evaluation
|
|
@@ -259,81 +264,43 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
259
264
|
raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
|
|
260
265
|
|
|
261
266
|
|
|
262
|
-
def
|
|
267
|
+
def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
263
268
|
target, func = _parse_entry(entry, cwd)
|
|
264
269
|
|
|
265
|
-
#
|
|
270
|
+
# Determine the file path to load
|
|
266
271
|
if "/" in target or "\\" in target or os.path.exists(target):
|
|
267
|
-
# It's a file path - convert to absolute
|
|
272
|
+
# It's a file path - convert to absolute
|
|
268
273
|
if not os.path.isabs(target):
|
|
269
274
|
target = os.path.abspath(os.path.join(cwd, target))
|
|
270
|
-
|
|
271
275
|
if not target.endswith(".py"):
|
|
272
276
|
target = target + ".py"
|
|
273
|
-
|
|
274
277
|
if not os.path.isfile(target):
|
|
275
278
|
raise ValueError(f"File not found: {target}")
|
|
276
|
-
|
|
277
|
-
# Import module from file path
|
|
278
|
-
spec = importlib.util.spec_from_file_location(Path(target).stem, target)
|
|
279
|
-
if not spec or not spec.loader:
|
|
280
|
-
raise ValueError(f"Unable to load module from path: {target}")
|
|
281
|
-
module = importlib.util.module_from_spec(spec)
|
|
282
|
-
sys.modules[spec.name] = module
|
|
283
|
-
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
284
|
-
module_name = spec.name
|
|
285
279
|
source_file_path = target
|
|
286
280
|
else:
|
|
287
|
-
# Treat as
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
281
|
+
# Treat dotted name as a file path
|
|
282
|
+
dotted_as_path = target.replace(".", "/") + ".py"
|
|
283
|
+
source_file_path = os.path.join(cwd, dotted_as_path)
|
|
284
|
+
|
|
285
|
+
# Load the module from the file path
|
|
286
|
+
spec = importlib.util.spec_from_file_location(Path(source_file_path).stem, source_file_path)
|
|
287
|
+
if not spec or not spec.loader:
|
|
288
|
+
raise ValueError(f"Unable to load module from path: {source_file_path}")
|
|
289
|
+
module = importlib.util.module_from_spec(spec)
|
|
290
|
+
sys.modules[spec.name] = module
|
|
291
|
+
spec.loader.exec_module(module) # type: ignore[attr-defined]
|
|
292
|
+
module_name = spec.name
|
|
291
293
|
|
|
292
294
|
if not hasattr(module, func):
|
|
293
295
|
raise ValueError(f"Function '{func}' not found in module '{module_name}'")
|
|
294
296
|
|
|
295
297
|
qualname = f"{module_name}.{func}"
|
|
296
|
-
|
|
297
|
-
DiscoveredTest(
|
|
298
|
-
module_path=module_name,
|
|
299
|
-
module_name=module_name,
|
|
300
|
-
qualname=qualname,
|
|
301
|
-
file_path=getattr(module, "__file__", module_name),
|
|
302
|
-
lineno=None,
|
|
303
|
-
has_parametrize=False,
|
|
304
|
-
param_count=0,
|
|
305
|
-
nodeids=[],
|
|
306
|
-
)
|
|
307
|
-
)
|
|
308
|
-
return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
|
|
298
|
+
return qualname, os.path.abspath(source_file_path) if source_file_path else ""
|
|
309
299
|
|
|
310
300
|
|
|
311
301
|
def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
|
|
312
|
-
#
|
|
313
|
-
|
|
314
|
-
func = test.qualname.split(".")[-1]
|
|
315
|
-
code = f"""
|
|
316
|
-
from typing import Any, Dict, List, Optional, Union
|
|
317
|
-
|
|
318
|
-
from eval_protocol.models import EvaluationRow, Message
|
|
319
|
-
from {module} import {func} as _ep_test
|
|
320
|
-
|
|
321
|
-
def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
|
|
322
|
-
row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
|
|
323
|
-
result = _ep_test(row) # Supports sync/async via decorator's dual-mode
|
|
324
|
-
if hasattr(result, "__await__"):
|
|
325
|
-
import asyncio
|
|
326
|
-
result = asyncio.get_event_loop().run_until_complete(result)
|
|
327
|
-
if result.evaluation_result is None:
|
|
328
|
-
return {{"score": 0.0, "reason": "No evaluation_result set"}}
|
|
329
|
-
out = {{
|
|
330
|
-
"score": float(result.evaluation_result.score or 0.0),
|
|
331
|
-
"reason": result.evaluation_result.reason,
|
|
332
|
-
"metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
|
|
333
|
-
}}
|
|
334
|
-
return out
|
|
335
|
-
"""
|
|
336
|
-
return (code, "main.py")
|
|
302
|
+
# Deprecated: we no longer generate a shim; keep stub for import compatibility
|
|
303
|
+
return ("", "main.py")
|
|
337
304
|
|
|
338
305
|
|
|
339
306
|
def _normalize_evaluator_id(evaluator_id: str) -> str:
|
|
@@ -522,10 +489,10 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
522
489
|
entries_arg = getattr(args, "entry", None)
|
|
523
490
|
if entries_arg:
|
|
524
491
|
entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
|
|
525
|
-
selected_specs: list[tuple[str, str
|
|
492
|
+
selected_specs: list[tuple[str, str]] = []
|
|
526
493
|
for e in entries:
|
|
527
|
-
|
|
528
|
-
selected_specs.append((
|
|
494
|
+
qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
|
|
495
|
+
selected_specs.append((qualname, resolved_path))
|
|
529
496
|
else:
|
|
530
497
|
print("Scanning for evaluation tests...")
|
|
531
498
|
tests = _discover_tests(root)
|
|
@@ -545,11 +512,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
545
512
|
print(" handles all parameter combinations. The evaluator will work with")
|
|
546
513
|
print(" the same logic regardless of which model/parameters are used.")
|
|
547
514
|
|
|
548
|
-
selected_specs = []
|
|
549
|
-
for t in selected_tests:
|
|
550
|
-
code, file_name = _generate_ts_mode_code(t)
|
|
551
|
-
# Store test info for better ID generation
|
|
552
|
-
selected_specs.append((code, file_name, t.qualname, t.file_path))
|
|
515
|
+
selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
|
|
553
516
|
|
|
554
517
|
base_id = getattr(args, "id", None)
|
|
555
518
|
display_name = getattr(args, "display_name", None)
|
|
@@ -560,6 +523,14 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
560
523
|
try:
|
|
561
524
|
fw_account_id = get_fireworks_account_id()
|
|
562
525
|
fw_api_key_value = get_fireworks_api_key()
|
|
526
|
+
if not fw_account_id and fw_api_key_value:
|
|
527
|
+
# Attempt to verify and resolve account id from server headers
|
|
528
|
+
resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
|
|
529
|
+
if resolved:
|
|
530
|
+
fw_account_id = resolved
|
|
531
|
+
# Propagate to environment so downstream calls use it if needed
|
|
532
|
+
os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
|
|
533
|
+
print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
|
|
563
534
|
if fw_account_id and fw_api_key_value:
|
|
564
535
|
print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
|
|
565
536
|
if create_or_update_fireworks_secret(
|
|
@@ -579,8 +550,7 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
579
550
|
print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
|
|
580
551
|
|
|
581
552
|
exit_code = 0
|
|
582
|
-
for i, (
|
|
583
|
-
# Use ts_mode to upload evaluator
|
|
553
|
+
for i, (qualname, source_file_path) in enumerate(selected_specs):
|
|
584
554
|
# Generate a short default ID from just the test function name
|
|
585
555
|
if base_id:
|
|
586
556
|
evaluator_id = base_id
|
|
@@ -618,12 +588,11 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
618
588
|
|
|
619
589
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
620
590
|
try:
|
|
591
|
+
test_dir = root
|
|
592
|
+
metric_name = os.path.basename(test_dir) or "metric"
|
|
621
593
|
result = create_evaluation(
|
|
622
594
|
evaluator_id=evaluator_id,
|
|
623
|
-
|
|
624
|
-
python_file_name_for_code=file_name,
|
|
625
|
-
criterion_name_for_code=qualname,
|
|
626
|
-
criterion_description_for_code=description or f"Evaluator for {qualname}",
|
|
595
|
+
metric_folders=[f"{metric_name}={test_dir}"],
|
|
627
596
|
display_name=display_name or evaluator_id,
|
|
628
597
|
description=description or f"Evaluator for {qualname}",
|
|
629
598
|
force=force,
|