eval-protocol 0.2.53__tar.gz → 0.2.54.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.53/eval_protocol.egg-info → eval_protocol-0.2.54.dev0}/PKG-INFO +1 -1
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/__init__.py +10 -7
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/fireworks_tracing.py +8 -6
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/openai_responses.py +1 -29
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/auth.py +0 -39
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli.py +0 -6
- eval_protocol-0.2.54.dev0/eval_protocol/cli_commands/logs.py +36 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/upload.py +54 -27
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/evaluation.py +40 -125
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/__init__.py +18 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/__init__.py +13 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/app.py +305 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/auth.py +17 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/langfuse.py +528 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/litellm.py +170 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/main.py +10 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/models.py +98 -0
- eval_protocol-0.2.54.dev0/eval_protocol/proxy/proxy_core/redis_utils.py +48 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/evaluation_test.py +1 -22
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/remote_rollout_processor.py +3 -22
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/logs_server.py +1 -9
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/SOURCES.txt +12 -5
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_show_results_url.py +0 -141
- eval_protocol-0.2.53/vite-app/dist/assets/index-zf20-zFD.js → eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-C81y9r9l.js +25 -25
- eval_protocol-0.2.53/vite-app/dist/assets/index-zf20-zFD.js.map → eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-C81y9r9l.js.map +1 -1
- eval_protocol-0.2.54.dev0/vite-app/dist/assets/index-DpYZaoAr.css +1 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.53/eval_protocol/cli_commands/logs.py +0 -76
- eval_protocol-0.2.53/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -63
- eval_protocol-0.2.53/eval_protocol/utils/browser_utils.py +0 -114
- eval_protocol-0.2.53/vite-app/dist/assets/index-BGlGI2LH.css +0 -1
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/LICENSE +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/README.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/models.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/pyproject.toml +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/setup.cfg +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/setup.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_format.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_length.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_math.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_models.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/versioneer.py +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.53
|
|
3
|
+
Version: 0.2.54.dev0
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -31,14 +31,10 @@ from .reward_function import RewardFunction
|
|
|
31
31
|
from .typed_interface import reward_function
|
|
32
32
|
from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
|
|
33
33
|
from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
|
|
34
|
-
from .pytest.remote_rollout_processor import create_elasticsearch_config_from_env
|
|
35
34
|
from .pytest.parameterize import DefaultParameterIdGenerator
|
|
36
35
|
from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
|
|
37
36
|
from .log_utils.rollout_id_filter import RolloutIdFilter
|
|
38
37
|
from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
|
|
39
|
-
from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
|
|
40
|
-
from .log_utils.elasticsearch_client import ElasticsearchConfig
|
|
41
|
-
|
|
42
38
|
|
|
43
39
|
from .types.remote_rollout_processor import (
|
|
44
40
|
InitRequest,
|
|
@@ -74,11 +70,16 @@ try:
|
|
|
74
70
|
except ImportError:
|
|
75
71
|
WeaveAdapter = None
|
|
76
72
|
|
|
73
|
+
try:
|
|
74
|
+
from .proxy import create_app, AuthProvider
|
|
75
|
+
except ImportError:
|
|
76
|
+
create_app = None
|
|
77
|
+
AuthProvider = None
|
|
78
|
+
|
|
79
|
+
|
|
77
80
|
warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
|
|
78
81
|
|
|
79
82
|
__all__ = [
|
|
80
|
-
"create_elasticsearch_config_from_env",
|
|
81
|
-
"ElasticsearchConfig",
|
|
82
83
|
"ElasticsearchDirectHttpHandler",
|
|
83
84
|
"RolloutIdFilter",
|
|
84
85
|
"setup_rollout_logging_for_elasticsearch_handler",
|
|
@@ -101,7 +102,6 @@ __all__ = [
|
|
|
101
102
|
"BraintrustAdapter",
|
|
102
103
|
"create_braintrust_adapter",
|
|
103
104
|
"LangSmithAdapter",
|
|
104
|
-
"FireworksTracingHttpHandler",
|
|
105
105
|
# Core interfaces
|
|
106
106
|
"Message",
|
|
107
107
|
"MetricResult",
|
|
@@ -137,6 +137,9 @@ __all__ = [
|
|
|
137
137
|
"RolloutMetadata",
|
|
138
138
|
"StatusResponse",
|
|
139
139
|
"create_langfuse_config_tags",
|
|
140
|
+
# Proxy
|
|
141
|
+
"create_app",
|
|
142
|
+
"AuthProvider",
|
|
140
143
|
]
|
|
141
144
|
|
|
142
145
|
from . import _version
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-13T17:28:46-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.53"
|
|
14
|
+
"full-revisionid": "bfe8e3146c3971cadf5c7e43d259b40e7e26163a",
|
|
15
|
+
"version": "0.2.54-dev"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
{eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
@@ -7,9 +7,9 @@ to pull data from Langfuse deployments with simplified retry logic handling.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
import logging
|
|
9
9
|
import requests
|
|
10
|
-
import time
|
|
11
10
|
from datetime import datetime
|
|
12
11
|
from typing import Any, Dict, List, Optional, Protocol
|
|
12
|
+
import os
|
|
13
13
|
|
|
14
14
|
from eval_protocol.models import EvaluationRow, InputMetadata, ExecutionMetadata, Message
|
|
15
15
|
from .base import BaseAdapter
|
|
@@ -343,15 +343,17 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
343
343
|
# Remove None values
|
|
344
344
|
params = {k: v for k, v in params.items() if v is not None}
|
|
345
345
|
|
|
346
|
-
# Make request to proxy
|
|
346
|
+
# Make request to proxy (using pointwise for efficiency)
|
|
347
347
|
if self.project_id:
|
|
348
|
-
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
|
|
348
|
+
url = f"{self.base_url}/v1/project_id/{self.project_id}/traces/pointwise"
|
|
349
349
|
else:
|
|
350
|
-
url = f"{self.base_url}/v1/traces"
|
|
350
|
+
url = f"{self.base_url}/v1/traces/pointwise"
|
|
351
|
+
|
|
352
|
+
headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
|
|
351
353
|
|
|
352
354
|
result = None
|
|
353
355
|
try:
|
|
354
|
-
response = requests.get(url, params=params, timeout=self.timeout)
|
|
356
|
+
response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
|
|
355
357
|
response.raise_for_status()
|
|
356
358
|
result = response.json()
|
|
357
359
|
except requests.exceptions.HTTPError as e:
|
|
@@ -365,7 +367,7 @@ class FireworksTracingAdapter(BaseAdapter):
|
|
|
365
367
|
except Exception: # In case e.response.json() fails
|
|
366
368
|
error_msg = f"Proxy error: {e.response.text}"
|
|
367
369
|
|
|
368
|
-
logger.error("Failed to fetch traces from proxy: %s", error_msg)
|
|
370
|
+
logger.error("Failed to fetch traces from proxy (HTTP %s): %s", e.response.status_code, error_msg)
|
|
369
371
|
return eval_rows
|
|
370
372
|
except requests.exceptions.RequestException as e:
|
|
371
373
|
# Non-HTTP errors (network issues, timeouts, etc.)
|
{eval_protocol-0.2.53 → eval_protocol-0.2.54.dev0}/eval_protocol/adapters/openai_responses.py
RENAMED
|
@@ -169,9 +169,7 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
169
169
|
raise NotImplementedError(f"Unsupported content type: {content_item.type}")
|
|
170
170
|
elif item.type == "function_call_output":
|
|
171
171
|
# Collect tool call outputs to add before assistant message
|
|
172
|
-
tool_call_outputs.append(
|
|
173
|
-
Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
|
|
174
|
-
)
|
|
172
|
+
tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
|
|
175
173
|
elif item.type == "function_call":
|
|
176
174
|
tool_call = ChatCompletionMessageToolCall(
|
|
177
175
|
id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
|
|
@@ -188,29 +186,3 @@ class OpenAIResponsesAdapter(BaseAdapter):
|
|
|
188
186
|
messages.append(Message(role="assistant", tool_calls=current_tool_calls))
|
|
189
187
|
|
|
190
188
|
return reversed(messages)
|
|
191
|
-
|
|
192
|
-
def _coerce_tool_output(self, output: Any) -> str:
|
|
193
|
-
"""Coerce OpenAI Responses tool output into a string for Message.content.
|
|
194
|
-
|
|
195
|
-
The Responses API may return structured content lists. For our purposes,
|
|
196
|
-
we stringify non-string outputs to satisfy the Message.content type.
|
|
197
|
-
"""
|
|
198
|
-
if isinstance(output, str):
|
|
199
|
-
return output
|
|
200
|
-
try:
|
|
201
|
-
# Attempt to join list of objects with any 'text' fields
|
|
202
|
-
if isinstance(output, list):
|
|
203
|
-
parts: list[str] = []
|
|
204
|
-
for part in output:
|
|
205
|
-
text = None
|
|
206
|
-
if isinstance(part, dict):
|
|
207
|
-
text = part.get("text")
|
|
208
|
-
if text:
|
|
209
|
-
parts.append(str(text))
|
|
210
|
-
else:
|
|
211
|
-
parts.append(str(part))
|
|
212
|
-
return "\n".join(parts)
|
|
213
|
-
# Fallback to string conversion
|
|
214
|
-
return str(output)
|
|
215
|
-
except Exception:
|
|
216
|
-
return str(output)
|
|
@@ -4,8 +4,6 @@ import os
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Dict, Optional # Added Dict
|
|
6
6
|
|
|
7
|
-
import requests
|
|
8
|
-
|
|
9
7
|
logger = logging.getLogger(__name__)
|
|
10
8
|
|
|
11
9
|
# Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
|
|
@@ -220,40 +218,3 @@ def get_fireworks_api_base() -> str:
|
|
|
220
218
|
else:
|
|
221
219
|
logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
|
|
222
220
|
return api_base
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def verify_api_key_and_get_account_id(
|
|
226
|
-
api_key: Optional[str] = None,
|
|
227
|
-
api_base: Optional[str] = None,
|
|
228
|
-
) -> Optional[str]:
|
|
229
|
-
"""
|
|
230
|
-
Calls the Fireworks API verify endpoint to validate the API key and returns the
|
|
231
|
-
account id from response headers when available.
|
|
232
|
-
|
|
233
|
-
Args:
|
|
234
|
-
api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
|
|
235
|
-
api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
|
|
236
|
-
|
|
237
|
-
Returns:
|
|
238
|
-
The resolved account id if verification succeeds and the header is present; otherwise None.
|
|
239
|
-
"""
|
|
240
|
-
try:
|
|
241
|
-
resolved_key = api_key or get_fireworks_api_key()
|
|
242
|
-
if not resolved_key:
|
|
243
|
-
return None
|
|
244
|
-
resolved_base = api_base or get_fireworks_api_base()
|
|
245
|
-
url = f"{resolved_base.rstrip('/')}/verifyApiKey"
|
|
246
|
-
headers = {"Authorization": f"Bearer {resolved_key}"}
|
|
247
|
-
resp = requests.get(url, headers=headers, timeout=10)
|
|
248
|
-
if resp.status_code != 200:
|
|
249
|
-
logger.debug("verifyApiKey returned status %s", resp.status_code)
|
|
250
|
-
return None
|
|
251
|
-
# Header keys could vary in case; requests provides case-insensitive dict
|
|
252
|
-
account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
|
|
253
|
-
if account_id and account_id.strip():
|
|
254
|
-
logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
|
|
255
|
-
return account_id.strip()
|
|
256
|
-
return None
|
|
257
|
-
except Exception as e:
|
|
258
|
-
logger.debug("Failed to verify API key for account id resolution: %s", e)
|
|
259
|
-
return None
|
|
@@ -301,12 +301,6 @@ def parse_args(args=None):
|
|
|
301
301
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
302
302
|
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
303
303
|
logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
|
|
304
|
-
logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
|
|
305
|
-
logs_parser.add_argument(
|
|
306
|
-
"--use-env-elasticsearch-config",
|
|
307
|
-
action="store_true",
|
|
308
|
-
help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
|
|
309
|
-
)
|
|
310
304
|
|
|
311
305
|
# Upload command
|
|
312
306
|
upload_parser = subparsers.add_parser(
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI command for serving logs with file watching and real-time updates.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..utils.logs_server import serve_logs
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def logs_command(args):
|
|
12
|
+
"""Serve logs with file watching and real-time updates"""
|
|
13
|
+
|
|
14
|
+
port = args.port
|
|
15
|
+
print("🚀 Starting Eval Protocol Logs Server")
|
|
16
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
17
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
18
|
+
print(f"👀 Watching paths: {['current directory']}")
|
|
19
|
+
print(f"🔍 Debug mode: {args.debug}")
|
|
20
|
+
print("Press Ctrl+C to stop the server")
|
|
21
|
+
print("-" * 50)
|
|
22
|
+
|
|
23
|
+
# setup Elasticsearch
|
|
24
|
+
from eval_protocol.pytest.elasticsearch_setup import ElasticsearchSetup
|
|
25
|
+
|
|
26
|
+
elasticsearch_config = ElasticsearchSetup().setup_elasticsearch()
|
|
27
|
+
|
|
28
|
+
try:
|
|
29
|
+
serve_logs(port=args.port, elasticsearch_config=elasticsearch_config, debug=args.debug)
|
|
30
|
+
return 0
|
|
31
|
+
except KeyboardInterrupt:
|
|
32
|
+
print("\n🛑 Server stopped by user")
|
|
33
|
+
return 0
|
|
34
|
+
except Exception as e:
|
|
35
|
+
print(f"❌ Error starting server: {e}")
|
|
36
|
+
return 1
|
|
@@ -12,12 +12,7 @@ from pathlib import Path
|
|
|
12
12
|
from typing import Any, Callable, Iterable, Optional
|
|
13
13
|
|
|
14
14
|
import pytest
|
|
15
|
-
from eval_protocol.auth import (
|
|
16
|
-
get_fireworks_account_id,
|
|
17
|
-
get_fireworks_api_key,
|
|
18
|
-
get_fireworks_api_base,
|
|
19
|
-
verify_api_key_and_get_account_id,
|
|
20
|
-
)
|
|
15
|
+
from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
|
|
21
16
|
from eval_protocol.platform_api import create_or_update_fireworks_secret
|
|
22
17
|
|
|
23
18
|
from eval_protocol.evaluation import create_evaluation
|
|
@@ -264,7 +259,7 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
264
259
|
raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
|
|
265
260
|
|
|
266
261
|
|
|
267
|
-
def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
262
|
+
def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
|
|
268
263
|
target, func = _parse_entry(entry, cwd)
|
|
269
264
|
|
|
270
265
|
# Check if target looks like a file path
|
|
@@ -298,12 +293,47 @@ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
|
|
|
298
293
|
raise ValueError(f"Function '{func}' not found in module '{module_name}'")
|
|
299
294
|
|
|
300
295
|
qualname = f"{module_name}.{func}"
|
|
301
|
-
|
|
296
|
+
code, file_name = _generate_ts_mode_code(
|
|
297
|
+
DiscoveredTest(
|
|
298
|
+
module_path=module_name,
|
|
299
|
+
module_name=module_name,
|
|
300
|
+
qualname=qualname,
|
|
301
|
+
file_path=getattr(module, "__file__", module_name),
|
|
302
|
+
lineno=None,
|
|
303
|
+
has_parametrize=False,
|
|
304
|
+
param_count=0,
|
|
305
|
+
nodeids=[],
|
|
306
|
+
)
|
|
307
|
+
)
|
|
308
|
+
return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
|
|
302
309
|
|
|
303
310
|
|
|
304
311
|
def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
|
|
305
|
-
#
|
|
306
|
-
|
|
312
|
+
# Generate a minimal main.py that imports the test module and calls the function
|
|
313
|
+
module = test.module_name
|
|
314
|
+
func = test.qualname.split(".")[-1]
|
|
315
|
+
code = f"""
|
|
316
|
+
from typing import Any, Dict, List, Optional, Union
|
|
317
|
+
|
|
318
|
+
from eval_protocol.models import EvaluationRow, Message
|
|
319
|
+
from {module} import {func} as _ep_test
|
|
320
|
+
|
|
321
|
+
def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
|
|
322
|
+
row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
|
|
323
|
+
result = _ep_test(row) # Supports sync/async via decorator's dual-mode
|
|
324
|
+
if hasattr(result, "__await__"):
|
|
325
|
+
import asyncio
|
|
326
|
+
result = asyncio.get_event_loop().run_until_complete(result)
|
|
327
|
+
if result.evaluation_result is None:
|
|
328
|
+
return {{"score": 0.0, "reason": "No evaluation_result set"}}
|
|
329
|
+
out = {{
|
|
330
|
+
"score": float(result.evaluation_result.score or 0.0),
|
|
331
|
+
"reason": result.evaluation_result.reason,
|
|
332
|
+
"metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
|
|
333
|
+
}}
|
|
334
|
+
return out
|
|
335
|
+
"""
|
|
336
|
+
return (code, "main.py")
|
|
307
337
|
|
|
308
338
|
|
|
309
339
|
def _normalize_evaluator_id(evaluator_id: str) -> str:
|
|
@@ -492,10 +522,10 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
492
522
|
entries_arg = getattr(args, "entry", None)
|
|
493
523
|
if entries_arg:
|
|
494
524
|
entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
|
|
495
|
-
selected_specs: list[tuple[str, str]] = []
|
|
525
|
+
selected_specs: list[tuple[str, str, str, str]] = []
|
|
496
526
|
for e in entries:
|
|
497
|
-
qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
|
|
498
|
-
selected_specs.append((qualname, resolved_path))
|
|
527
|
+
code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
|
|
528
|
+
selected_specs.append((code, file_name, qualname, resolved_path))
|
|
499
529
|
else:
|
|
500
530
|
print("Scanning for evaluation tests...")
|
|
501
531
|
tests = _discover_tests(root)
|
|
@@ -515,7 +545,11 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
515
545
|
print(" handles all parameter combinations. The evaluator will work with")
|
|
516
546
|
print(" the same logic regardless of which model/parameters are used.")
|
|
517
547
|
|
|
518
|
-
selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
|
|
548
|
+
selected_specs = []
|
|
549
|
+
for t in selected_tests:
|
|
550
|
+
code, file_name = _generate_ts_mode_code(t)
|
|
551
|
+
# Store test info for better ID generation
|
|
552
|
+
selected_specs.append((code, file_name, t.qualname, t.file_path))
|
|
519
553
|
|
|
520
554
|
base_id = getattr(args, "id", None)
|
|
521
555
|
display_name = getattr(args, "display_name", None)
|
|
@@ -526,14 +560,6 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
526
560
|
try:
|
|
527
561
|
fw_account_id = get_fireworks_account_id()
|
|
528
562
|
fw_api_key_value = get_fireworks_api_key()
|
|
529
|
-
if not fw_account_id and fw_api_key_value:
|
|
530
|
-
# Attempt to verify and resolve account id from server headers
|
|
531
|
-
resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
|
|
532
|
-
if resolved:
|
|
533
|
-
fw_account_id = resolved
|
|
534
|
-
# Propagate to environment so downstream calls use it if needed
|
|
535
|
-
os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
|
|
536
|
-
print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
|
|
537
563
|
if fw_account_id and fw_api_key_value:
|
|
538
564
|
print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
|
|
539
565
|
if create_or_update_fireworks_secret(
|
|
@@ -553,7 +579,8 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
553
579
|
print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
|
|
554
580
|
|
|
555
581
|
exit_code = 0
|
|
556
|
-
for i, (qualname, source_file_path) in enumerate(selected_specs):
|
|
582
|
+
for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
|
|
583
|
+
# Use ts_mode to upload evaluator
|
|
557
584
|
# Generate a short default ID from just the test function name
|
|
558
585
|
if base_id:
|
|
559
586
|
evaluator_id = base_id
|
|
@@ -591,12 +618,12 @@ def upload_command(args: argparse.Namespace) -> int:
|
|
|
591
618
|
|
|
592
619
|
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
|
|
593
620
|
try:
|
|
594
|
-
# Always treat as a single evaluator (single-metric) even if folder has helper modules
|
|
595
|
-
test_dir = os.path.dirname(source_file_path) if source_file_path else root
|
|
596
|
-
metric_name = os.path.basename(test_dir) or "metric"
|
|
597
621
|
result = create_evaluation(
|
|
598
622
|
evaluator_id=evaluator_id,
|
|
599
|
-
|
|
623
|
+
python_code_to_evaluate=code,
|
|
624
|
+
python_file_name_for_code=file_name,
|
|
625
|
+
criterion_name_for_code=qualname,
|
|
626
|
+
criterion_description_for_code=description or f"Evaluator for {qualname}",
|
|
600
627
|
display_name=display_name or evaluator_id,
|
|
601
628
|
description=description or f"Evaluator for {qualname}",
|
|
602
629
|
force=force,
|