eval-protocol 0.2.69.dev3__tar.gz → 0.2.70__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.69.dev3/eval_protocol.egg-info → eval_protocol-0.2.70}/PKG-INFO +1 -1
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/models.py +35 -3
- eval_protocol-0.2.70/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +177 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70/eval_protocol.egg-info}/PKG-INFO +1 -1
- eval_protocol-0.2.69.dev3/eval_protocol/quickstart/svg_agent/vercel_svg_server/api/init.py +0 -162
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/LICENSE +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/README.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/development/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/base.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/bigquery.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/fireworks_tracing.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/langchain.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/langsmith.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/openai_responses.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/weave.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/data/airline_dataset.jsonl +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/data/retail_dataset.jsonl +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_aime25.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_frozen_lake.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_gpqa.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_livebench_data_analysis.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_tau_bench_airline.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_tau_bench_retail.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/create_rft.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/upload.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/dynamic_data_loader.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/factory_data_loader.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/inline_data_loader.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/models.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/fireworks_rft.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/elasticsearch_client.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/elasticsearch_direct_http_handler.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/elasticsearch_index_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/fireworks_tracing_http_handler.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/init.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/rollout_context.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/rollout_id_filter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/log_utils/util.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/execution/manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_adapter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/frozen_lake/frozen_lake_mcp.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/frozen_lake/server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/README.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/tau2_mcp.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/app.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/auth.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/langfuse.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/litellm.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/main.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/models.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/proxy/proxy_core/redis_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_langchain_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_no_op_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/dual_mode_wrapper.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/elasticsearch_setup.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/evaluation_test.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/evaluation_test_postprocess.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/evaluation_test_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/exception_config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/execution.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/generate_parameter_combinations.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/github_action_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/handle_persist_flow.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/parameterize.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/store_experiment_link.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/store_results_url.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/tracing_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/pytest/validate_signature.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/llm_judge.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/llm_judge_langfuse.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/llm_judge_langsmith.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/llm_judge_openai_responses.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/aha_judge/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/llm_judge.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/llm_judge_braintrust.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/quickstart/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/types/errors.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/types/remote_rollout_processor.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/browser_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/check_server_status.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/evaluation_row_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/logs_models.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/logs_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/show_results_url.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/subprocess_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol.egg-info/SOURCES.txt +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/pyproject.toml +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/setup.cfg +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/setup.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_directory_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_ep_upload_e2e.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_evaluation_postprocess.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_event_bus_helper.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_format.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_human_id.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_length.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_logs_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_logs_server_simple.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_math.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_models.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_quickstart_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_retry_mechanism.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_show_results_url.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_status_migration_changes.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_status_migration_integration.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_status_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_tau_bench_airline_smoke.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_upload_entrypoint.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/tests/test_vite_server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/airline/policy.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/mock/policy.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/mock/policy_solo.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/retail/policy.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/telecom/main_policy.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/telecom/main_policy_solo.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/telecom/tech_support_manual.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/telecom/tech_support_workflow.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/domains/telecom/tech_support_workflow_solo.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/user_simulator/simulation_guidelines.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/versioneer.py +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/assets/index-BGlGI2LH.css +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/assets/index-CnGlFAnP.js +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/assets/index-CnGlFAnP.js.map +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- {eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/vite-app/dist/index.html +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.70
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-10-
|
|
11
|
+
"date": "2025-10-29T04:00:08-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "c705cb8d88a8d5966f22c84172d885a4352debc0",
|
|
15
|
+
"version": "0.2.70"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -214,10 +214,10 @@ class Status(BaseModel):
|
|
|
214
214
|
logger.info(f"Re-raising {exception_type} from status details")
|
|
215
215
|
raise exception_to_raise
|
|
216
216
|
else:
|
|
217
|
-
logger.
|
|
217
|
+
logger.info(f"Could not create instance of {exception_type}")
|
|
218
218
|
continue
|
|
219
219
|
else:
|
|
220
|
-
logger.
|
|
220
|
+
logger.info(f"Could not import exception type: {exception_type}")
|
|
221
221
|
continue
|
|
222
222
|
|
|
223
223
|
return False
|
|
@@ -244,7 +244,9 @@ class Status(BaseModel):
|
|
|
244
244
|
# NOTE: we are losing some diagnostic information here by not passing the model and llm_provider. We could try to capture full exception state in rollout_error_from_exception.
|
|
245
245
|
lambda: exception_class(message, model="unknown", llm_provider="unknown"),
|
|
246
246
|
lambda: exception_class(message=message, model="unknown", llm_provider="unknown"),
|
|
247
|
-
# Pattern
|
|
247
|
+
# Pattern 5: OpenAI exceptions - create mock response object
|
|
248
|
+
lambda: cls._create_openai_exception(exception_class, message),
|
|
249
|
+
# Pattern 7: No arguments (fallback)
|
|
248
250
|
lambda: exception_class(),
|
|
249
251
|
]
|
|
250
252
|
|
|
@@ -260,6 +262,36 @@ class Status(BaseModel):
|
|
|
260
262
|
logger.debug(f"All constructor patterns failed for {exception_class.__name__}")
|
|
261
263
|
return None
|
|
262
264
|
|
|
265
|
+
@classmethod
|
|
266
|
+
def _create_openai_exception(cls, exception_class: type, message: str) -> Optional[Exception]:
|
|
267
|
+
"""
|
|
268
|
+
Create OpenAI exception with a mock response object.
|
|
269
|
+
|
|
270
|
+
OpenAI exceptions require httpx.Response objects which are complex to create,
|
|
271
|
+
so we create a minimal mock that satisfies the basic requirements.
|
|
272
|
+
"""
|
|
273
|
+
try:
|
|
274
|
+
import httpx
|
|
275
|
+
|
|
276
|
+
# Create a minimal mock response object
|
|
277
|
+
class MockRequest:
|
|
278
|
+
def __init__(self):
|
|
279
|
+
self.method = "POST"
|
|
280
|
+
self.url = "https://api.openai.com/v1/chat/completions"
|
|
281
|
+
|
|
282
|
+
class MockResponse:
|
|
283
|
+
def __init__(self):
|
|
284
|
+
self.status_code = 404
|
|
285
|
+
self.headers = {"x-request-id": "mock-request-id"}
|
|
286
|
+
self.request = MockRequest()
|
|
287
|
+
|
|
288
|
+
mock_response = MockResponse()
|
|
289
|
+
return exception_class(message, response=mock_response, body=None)
|
|
290
|
+
|
|
291
|
+
except Exception as e:
|
|
292
|
+
logging.getLogger(__name__).debug(f"Failed to create OpenAI exception with mock response: {e}")
|
|
293
|
+
return None
|
|
294
|
+
|
|
263
295
|
@classmethod
|
|
264
296
|
def _import_exception_class(cls, exception_type: str) -> Optional[type]:
|
|
265
297
|
"""
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Vercel serverless function for SVGBench remote evaluation.
|
|
3
|
+
|
|
4
|
+
This function handles the model call part of the evaluation pipeline.
|
|
5
|
+
The SVG evaluation logic remains in the test client.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import logging
|
|
11
|
+
import sys
|
|
12
|
+
import asyncio
|
|
13
|
+
from flask import Flask, request, jsonify
|
|
14
|
+
from openai import OpenAI
|
|
15
|
+
from dotenv import load_dotenv
|
|
16
|
+
|
|
17
|
+
from eval_protocol import Status, InitRequest, FireworksTracingHttpHandler, RolloutIdFilter
|
|
18
|
+
|
|
19
|
+
load_dotenv()
|
|
20
|
+
|
|
21
|
+
# Configure logging so INFO and below go to stdout, WARNING+ to stderr.
|
|
22
|
+
# This avoids Vercel marking INFO logs as [error] (stderr).
|
|
23
|
+
root_logger = logging.getLogger()
|
|
24
|
+
root_logger.handlers.clear()
|
|
25
|
+
root_logger.setLevel(logging.INFO)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _InfoOnly(logging.Filter):
|
|
29
|
+
def filter(self, record: logging.LogRecord) -> bool:
|
|
30
|
+
return record.levelno <= logging.INFO
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
formatter = logging.Formatter("%(levelname)s:%(name)s:%(message)s")
|
|
34
|
+
|
|
35
|
+
stdout_handler = logging.StreamHandler(sys.stdout)
|
|
36
|
+
stdout_handler.addFilter(_InfoOnly())
|
|
37
|
+
stdout_handler.setFormatter(formatter)
|
|
38
|
+
root_logger.addHandler(stdout_handler)
|
|
39
|
+
|
|
40
|
+
stderr_handler = logging.StreamHandler(sys.stderr)
|
|
41
|
+
stderr_handler.setLevel(logging.WARNING)
|
|
42
|
+
stderr_handler.setFormatter(formatter)
|
|
43
|
+
root_logger.addHandler(stderr_handler)
|
|
44
|
+
|
|
45
|
+
# Attach Fireworks tracing handler to root logger (non-stream HTTP sink)
|
|
46
|
+
root_logger.addHandler(FireworksTracingHttpHandler())
|
|
47
|
+
|
|
48
|
+
# Create Flask app
|
|
49
|
+
app = Flask(__name__)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def execute_rollout_background(req, api_key):
|
|
53
|
+
"""Execute the OpenAI completion in background and log results"""
|
|
54
|
+
# Attach rollout_id filter to logger
|
|
55
|
+
logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
|
|
56
|
+
logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
|
|
57
|
+
|
|
58
|
+
try:
|
|
59
|
+
model = req.completion_params.get("model")
|
|
60
|
+
# Uncomment if you need to strip fireworks_ai/ prefix
|
|
61
|
+
# if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
|
|
62
|
+
# model = model[len("fireworks_ai/"):]
|
|
63
|
+
|
|
64
|
+
# Prepare completion arguments
|
|
65
|
+
completion_kwargs = {
|
|
66
|
+
"messages": req.messages,
|
|
67
|
+
# "messages": [{"role": "user", "content": "Hello, how are you?"}],
|
|
68
|
+
"model": model,
|
|
69
|
+
"temperature": req.completion_params.get("temperature"),
|
|
70
|
+
"max_tokens": req.completion_params.get("max_tokens"),
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
# Add tools if present
|
|
74
|
+
if req.tools:
|
|
75
|
+
completion_kwargs["tools"] = req.tools
|
|
76
|
+
|
|
77
|
+
logger.info(
|
|
78
|
+
f"DEBUG: {req.model_base_url}, COMPLETION_KWARGS: {completion_kwargs}, API_KEY: {api_key}, MODEL: {model}"
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
# Create AsyncOpenAI client
|
|
82
|
+
# client = AsyncOpenAI(base_url=req.model_base_url, api_key=api_key)
|
|
83
|
+
client = OpenAI(base_url=req.model_base_url, api_key=api_key)
|
|
84
|
+
|
|
85
|
+
logger.info(f"Sending completion request to model {model}")
|
|
86
|
+
|
|
87
|
+
# Make the async model call with timeout
|
|
88
|
+
import time
|
|
89
|
+
|
|
90
|
+
logger.info(f"timing start: {time.time()}")
|
|
91
|
+
completion = client.chat.completions.create(**completion_kwargs)
|
|
92
|
+
logger.info(f"Completed response: {completion}")
|
|
93
|
+
logger.info(f"timing end: {time.time()}")
|
|
94
|
+
# Log successful completion - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
|
|
95
|
+
logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
|
|
96
|
+
|
|
97
|
+
except Exception as e:
|
|
98
|
+
# Log error with structured status - THIS IS WHAT RemoteRolloutProcessor POLLS FOR
|
|
99
|
+
logger.error(
|
|
100
|
+
f"Rollout {req.metadata.rollout_id} failed: {e}", extra={"status": Status.rollout_error_from_exception(e)}
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@app.route("/init", methods=["POST"])
|
|
105
|
+
async def init():
|
|
106
|
+
try:
|
|
107
|
+
# Parse as InitRequest
|
|
108
|
+
req = InitRequest(**request.get_json())
|
|
109
|
+
|
|
110
|
+
# Create logger for immediate validation logging
|
|
111
|
+
logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
|
|
112
|
+
logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
|
|
113
|
+
|
|
114
|
+
# Validate required fields
|
|
115
|
+
if not req.messages:
|
|
116
|
+
error_msg = "messages is required"
|
|
117
|
+
logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
|
|
118
|
+
return jsonify({"error": error_msg}), 400
|
|
119
|
+
|
|
120
|
+
# Get API key (prefer request api_key, fallback to environment)
|
|
121
|
+
if req.api_key:
|
|
122
|
+
logger.info("Using API key from request")
|
|
123
|
+
api_key = req.api_key
|
|
124
|
+
elif os.environ.get("FIREWORKS_API_KEY"):
|
|
125
|
+
logger.info("Using API key from environment")
|
|
126
|
+
api_key = os.environ.get("FIREWORKS_API_KEY")
|
|
127
|
+
else:
|
|
128
|
+
error_msg = "API key not provided in request or environment variable"
|
|
129
|
+
logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
|
|
130
|
+
return jsonify({"error": error_msg}), 401
|
|
131
|
+
|
|
132
|
+
# 🔥 FIRE: Return immediately with acceptance (within 30s requirement)
|
|
133
|
+
response_data = {
|
|
134
|
+
"status": "accepted",
|
|
135
|
+
"rollout_id": req.metadata.rollout_id,
|
|
136
|
+
"message": "Rollout processing started",
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
# Fire and forget: Execute rollout asynchronously
|
|
140
|
+
asyncio.create_task(execute_rollout_background(req, api_key))
|
|
141
|
+
|
|
142
|
+
return jsonify(response_data), 200
|
|
143
|
+
|
|
144
|
+
except Exception as e:
|
|
145
|
+
# For request parsing errors, return error immediately (don't retry)
|
|
146
|
+
return jsonify({"error": f"Request parsing error: {str(e)}"}), 400
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@app.route("/", methods=["GET"])
|
|
150
|
+
def health_check():
|
|
151
|
+
"""Health check endpoint"""
|
|
152
|
+
return jsonify(
|
|
153
|
+
{
|
|
154
|
+
"status": "ok",
|
|
155
|
+
"message": "SVGBench Vercel Serverless Function",
|
|
156
|
+
"endpoints": {"POST /": "Process SVGBench evaluation requests"},
|
|
157
|
+
}
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@app.route("/", methods=["OPTIONS"])
|
|
162
|
+
def options_handler():
|
|
163
|
+
"""Handle CORS preflight requests"""
|
|
164
|
+
response = jsonify({})
|
|
165
|
+
response.headers["Access-Control-Allow-Origin"] = "*"
|
|
166
|
+
response.headers["Access-Control-Allow-Methods"] = "POST, GET, OPTIONS"
|
|
167
|
+
response.headers["Access-Control-Allow-Headers"] = "Content-Type"
|
|
168
|
+
return response
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Add CORS headers to all responses
|
|
172
|
+
@app.after_request
|
|
173
|
+
def add_cors_headers(response):
|
|
174
|
+
response.headers["Access-Control-Allow-Origin"] = "*"
|
|
175
|
+
response.headers["Access-Control-Allow-Methods"] = "POST, GET, OPTIONS"
|
|
176
|
+
response.headers["Access-Control-Allow-Headers"] = "Content-Type"
|
|
177
|
+
return response
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.70
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -1,162 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Vercel serverless function for SVGBench remote evaluation.
|
|
3
|
-
|
|
4
|
-
This function handles the model call part of the evaluation pipeline.
|
|
5
|
-
The SVG evaluation logic remains in the test client.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import json
|
|
9
|
-
import os
|
|
10
|
-
import logging
|
|
11
|
-
import sys
|
|
12
|
-
from http.server import BaseHTTPRequestHandler
|
|
13
|
-
from openai import OpenAI
|
|
14
|
-
from dotenv import load_dotenv
|
|
15
|
-
|
|
16
|
-
from eval_protocol import Status, InitRequest, FireworksTracingHttpHandler, RolloutIdFilter
|
|
17
|
-
|
|
18
|
-
load_dotenv()
|
|
19
|
-
|
|
20
|
-
# Configure logging so INFO and below go to stdout, WARNING+ to stderr.
|
|
21
|
-
# This avoids Vercel marking INFO logs as [error] (stderr).
|
|
22
|
-
root_logger = logging.getLogger()
|
|
23
|
-
root_logger.handlers.clear()
|
|
24
|
-
root_logger.setLevel(logging.INFO)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class _InfoOnly(logging.Filter):
|
|
28
|
-
def filter(self, record: logging.LogRecord) -> bool:
|
|
29
|
-
return record.levelno <= logging.INFO
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
formatter = logging.Formatter("%(levelname)s:%(name)s:%(message)s")
|
|
33
|
-
|
|
34
|
-
stdout_handler = logging.StreamHandler(sys.stdout)
|
|
35
|
-
stdout_handler.addFilter(_InfoOnly())
|
|
36
|
-
stdout_handler.setFormatter(formatter)
|
|
37
|
-
root_logger.addHandler(stdout_handler)
|
|
38
|
-
|
|
39
|
-
stderr_handler = logging.StreamHandler(sys.stderr)
|
|
40
|
-
stderr_handler.setLevel(logging.WARNING)
|
|
41
|
-
stderr_handler.setFormatter(formatter)
|
|
42
|
-
root_logger.addHandler(stderr_handler)
|
|
43
|
-
|
|
44
|
-
# Attach Fireworks tracing handler to root logger (non-stream HTTP sink)
|
|
45
|
-
root_logger.addHandler(FireworksTracingHttpHandler())
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
class handler(BaseHTTPRequestHandler):
|
|
49
|
-
def do_POST(self):
|
|
50
|
-
try:
|
|
51
|
-
# Read and parse request body
|
|
52
|
-
content_length = int(self.headers.get("Content-Length", 0))
|
|
53
|
-
request_body = self.rfile.read(content_length).decode("utf-8")
|
|
54
|
-
request_data = json.loads(request_body)
|
|
55
|
-
|
|
56
|
-
# Parse as InitRequest
|
|
57
|
-
req = InitRequest(**request_data)
|
|
58
|
-
|
|
59
|
-
# Attach rollout_id filter to logger
|
|
60
|
-
logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
|
|
61
|
-
logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
|
|
62
|
-
|
|
63
|
-
# Validate required fields
|
|
64
|
-
if not req.messages:
|
|
65
|
-
error_msg = "messages is required"
|
|
66
|
-
logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
|
|
67
|
-
self._send_error(400, error_msg)
|
|
68
|
-
return
|
|
69
|
-
|
|
70
|
-
model = req.completion_params.get("model")
|
|
71
|
-
if model and isinstance(model, str) and model.startswith("fireworks_ai/"):
|
|
72
|
-
model = model[len("fireworks_ai/") :]
|
|
73
|
-
|
|
74
|
-
# Prepare completion arguments
|
|
75
|
-
completion_kwargs = {
|
|
76
|
-
"messages": req.messages,
|
|
77
|
-
"model": model,
|
|
78
|
-
"temperature": req.completion_params.get("temperature"),
|
|
79
|
-
"max_tokens": req.completion_params.get("max_tokens"),
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
# Add tools if present
|
|
83
|
-
if req.tools:
|
|
84
|
-
completion_kwargs["tools"] = req.tools
|
|
85
|
-
|
|
86
|
-
# Get API key (prefer request api_key, fallback to environment)
|
|
87
|
-
api_key = req.api_key or os.environ.get("FIREWORKS_API_KEY")
|
|
88
|
-
if not api_key:
|
|
89
|
-
error_msg = "API key not provided in request or FIREWORKS_API_KEY environment variable"
|
|
90
|
-
logger.error(error_msg, extra={"status": Status.rollout_error(error_msg)})
|
|
91
|
-
self._send_error(500, error_msg)
|
|
92
|
-
return
|
|
93
|
-
|
|
94
|
-
# Create OpenAI client
|
|
95
|
-
client = OpenAI(base_url=req.model_base_url, api_key=api_key)
|
|
96
|
-
|
|
97
|
-
logger.info(f"Sending completion request to model {req.completion_params.get('model')}")
|
|
98
|
-
|
|
99
|
-
# Make the model call
|
|
100
|
-
completion = client.chat.completions.create(**completion_kwargs)
|
|
101
|
-
|
|
102
|
-
logger.info(f"Completed response: {completion}")
|
|
103
|
-
|
|
104
|
-
# Log completion status
|
|
105
|
-
logger.info(f"Rollout {req.metadata.rollout_id} completed", extra={"status": Status.rollout_finished()})
|
|
106
|
-
|
|
107
|
-
# Return the completion response
|
|
108
|
-
response_data = {
|
|
109
|
-
"status": "completed",
|
|
110
|
-
"rollout_id": req.metadata.rollout_id,
|
|
111
|
-
"choices": [
|
|
112
|
-
{
|
|
113
|
-
"message": {
|
|
114
|
-
"role": completion.choices[0].message.role,
|
|
115
|
-
"content": completion.choices[0].message.content,
|
|
116
|
-
}
|
|
117
|
-
}
|
|
118
|
-
],
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
self._send_json_response(200, response_data)
|
|
122
|
-
|
|
123
|
-
except Exception as e:
|
|
124
|
-
# Log error if we have the request context
|
|
125
|
-
if "req" in locals() and "logger" in locals():
|
|
126
|
-
logger.error(f"❌ Error in rollout {req.metadata.rollout_id}: {e}")
|
|
127
|
-
logger.error(str(e), extra={"status": Status.rollout_error(str(e))})
|
|
128
|
-
|
|
129
|
-
self._send_error(500, str(e))
|
|
130
|
-
|
|
131
|
-
def do_GET(self):
|
|
132
|
-
"""Health check endpoint"""
|
|
133
|
-
self._send_json_response(
|
|
134
|
-
200,
|
|
135
|
-
{
|
|
136
|
-
"status": "ok",
|
|
137
|
-
"message": "SVGBench Vercel Serverless Function",
|
|
138
|
-
"endpoints": {"POST /": "Process SVGBench evaluation requests"},
|
|
139
|
-
},
|
|
140
|
-
)
|
|
141
|
-
|
|
142
|
-
def do_OPTIONS(self):
|
|
143
|
-
"""Handle CORS preflight requests"""
|
|
144
|
-
self.send_response(200)
|
|
145
|
-
self.send_header("Access-Control-Allow-Origin", "*")
|
|
146
|
-
self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
|
|
147
|
-
self.send_header("Access-Control-Allow-Headers", "Content-Type")
|
|
148
|
-
self.end_headers()
|
|
149
|
-
|
|
150
|
-
def _send_json_response(self, status_code: int, data: dict):
|
|
151
|
-
"""Send a JSON response"""
|
|
152
|
-
self.send_response(status_code)
|
|
153
|
-
self.send_header("Content-Type", "application/json")
|
|
154
|
-
self.send_header("Access-Control-Allow-Origin", "*")
|
|
155
|
-
self.send_header("Access-Control-Allow-Methods", "POST, GET, OPTIONS")
|
|
156
|
-
self.send_header("Access-Control-Allow-Headers", "Content-Type")
|
|
157
|
-
self.end_headers()
|
|
158
|
-
self.wfile.write(json.dumps(data).encode("utf-8"))
|
|
159
|
-
|
|
160
|
-
def _send_error(self, status_code: int, message: str):
|
|
161
|
-
"""Send an error response"""
|
|
162
|
-
self._send_json_response(status_code, {"error": message})
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/fireworks_tracing.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/adapters/openai_responses.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/docker_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/agent/resources/sql_resource.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_frozen_lake.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/benchmarks/test_tau_bench_retail.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/agent_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/cli_commands/run_eval_cmd.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/dynamic_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/factory_data_loader.py
RENAMED
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/data_loader/inline_data_loader.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/dataset_logger/dataset_logger.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{eval_protocol-0.2.69.dev3 → eval_protocol-0.2.70}/eval_protocol/event_bus/sqlite_event_bus.py
RENAMED
|
File without changes
|
|
File without changes
|