eval-protocol 0.2.6.dev2__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.6.dev2/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +3 -4
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +14 -11
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -4
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +4 -5
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +96 -38
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +8 -2
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
- eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
- eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
- eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-D9iVTBbF.css +0 -1
- eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js +0 -88
- eval_protocol-0.2.6.dev2/vite-app/dist/assets/index-DiF_B1x_.js.map +0 -1
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/LICENSE +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/README.md +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/pyproject.toml +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.cfg +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/setup.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/versioneer.py +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
{eval_protocol-0.2.6.dev2/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.6.dev2
+Version: 0.2.8
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3

```diff
@@ -8,11 +8,11 @@ import json

 version_json = '''
 {
- "date": "2025-08-
+ "date": "2025-08-11T22:02:14-0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "
- "version": "0.2.6.dev2"
+ "full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
+ "version": "0.2.8"
 }
 ''' # END VERSION_JSON

```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0

```diff
@@ -289,6 +289,7 @@ def parse_args(args=None):

     # Logs command
     logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
+    logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")

     # Run command (for Hydra-based evaluations)
     # This subparser intentionally defines no arguments itself.
```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3

```diff
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
 def logs_command(args):
     """Serve logs with file watching and real-time updates"""

+    port = args.port
     print(f"🚀 Starting Eval Protocol Logs Server")
-    print(f"🌐 URL: http://localhost:
-    print(f"🔌 WebSocket: ws://localhost:
+    print(f"🌐 URL: http://localhost:{port}")
+    print(f"🔌 WebSocket: ws://localhost:{port}/ws")
     print(f"👀 Watching paths: {['current directory']}")
     print("Press Ctrl+C to stop the server")
     print("-" * 50)

     try:
-        serve_logs()
+        serve_logs(port=args.port)
         return 0
     except KeyboardInterrupt:
         print("\n🛑 Server stopped by user")
```
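The two hunks above add a `--port` flag to the `logs` subcommand and thread it through to `serve_logs`. A minimal Python sketch of the same call (the `port` keyword and import path are taken from this diff; like the CLI command, the call blocks until interrupted):

```python
# Start the logs server on a non-default port, mirroring what logs_command does.
from eval_protocol.utils.logs_server import serve_logs

# UI at http://localhost:9000, WebSocket at ws://localhost:9000/ws
serve_logs(port=9000)
```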
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +3 -4

```diff
@@ -22,9 +22,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
         self._store = SqliteEvaluationRowStore(self.db_path)

     def log(self, row: "EvaluationRow") -> None:
-        row_id = row.input_metadata.row_id
         data = row.model_dump(exclude_none=True, mode="json")
-        self._store.upsert_row(
+        self._store.upsert_row(data=data)
         try:
             event_bus.emit(LOG_EVENT_TYPE, EvaluationRow(**data))
         except Exception as e:
@@ -32,8 +31,8 @@ class SqliteDatasetLoggerAdapter(DatasetLogger):
             logger.error(f"Failed to emit row_upserted event: {e}")
             pass

-    def read(self,
+    def read(self, rollout_id: Optional[str] = None) -> List["EvaluationRow"]:
         from eval_protocol.models import EvaluationRow

-        results = self._store.read_rows(
+        results = self._store.read_rows(rollout_id=rollout_id)
         return [EvaluationRow(**data) for data in results]
```
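The adapter no longer reads `row.input_metadata.row_id`; persistence is keyed by the rollout ID carried inside the serialized row. A sketch of the resulting API (the no-argument constructor is an assumption; the diff only shows the adapter building a store from `self.db_path`):

```python
from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
from eval_protocol.models import EvaluationRow

adapter = SqliteDatasetLoggerAdapter()  # assumed default constructor

row = EvaluationRow(messages=[])
adapter.log(row)  # upserts, keyed by row.execution_metadata.rollout_id

# read() now filters by rollout_id instead of the removed row_id parameter
same_rows = adapter.read(rollout_id=row.execution_metadata.rollout_id)
```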
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +14 -11

```diff
@@ -11,7 +11,7 @@ class SqliteEvaluationRowStore:
     """
     Lightweight reusable SQLite store for evaluation rows.

-    Stores arbitrary row data as JSON keyed by a unique string `
+    Stores arbitrary row data as JSON keyed by a unique string `rollout_id`.
     """

     def __init__(self, db_path: str):
@@ -24,7 +24,7 @@ class SqliteEvaluationRowStore:
             database = self._db

         class EvaluationRow(BaseModel):  # type: ignore
-
+            rollout_id = CharField(unique=True)
             data = JSONField()

         self._EvaluationRow = EvaluationRow
@@ -36,22 +36,25 @@ class SqliteEvaluationRowStore:
     def db_path(self) -> str:
         return self._db_path

-    def upsert_row(self,
-
-
+    def upsert_row(self, data: dict) -> None:
+        rollout_id = data["execution_metadata"]["rollout_id"]
+        if rollout_id is None:
+            raise ValueError("execution_metadata.rollout_id is required to upsert a row")
+        if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
+            self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
         else:
-            self._EvaluationRow.create(
+            self._EvaluationRow.create(rollout_id=rollout_id, data=data)

-    def read_rows(self,
-        if
+    def read_rows(self, rollout_id: Optional[str] = None) -> List[dict]:
+        if rollout_id is None:
             query = self._EvaluationRow.select().dicts()
         else:
-            query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.
+            query = self._EvaluationRow.select().dicts().where(self._EvaluationRow.rollout_id == rollout_id)
         results = list(query)
         return [result["data"] for result in results]

-    def delete_row(self,
-        return self._EvaluationRow.delete().where(self._EvaluationRow.
+    def delete_row(self, rollout_id: str) -> int:
+        return self._EvaluationRow.delete().where(self._EvaluationRow.rollout_id == rollout_id).execute()

     def delete_all_rows(self) -> int:
         return self._EvaluationRow.delete().execute()
```
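The store now derives its key from `data["execution_metadata"]["rollout_id"]`, so logging the same rollout twice updates one record instead of inserting a duplicate. A minimal sketch of that behavior (the database path is hypothetical; the dict shape mirrors what the adapter above passes in):

```python
from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore

store = SqliteEvaluationRowStore("/tmp/eval_rows.db")  # hypothetical path

data = {"execution_metadata": {"rollout_id": "rollout-abc"}, "messages": []}
store.upsert_row(data=data)                     # first call inserts
data["messages"] = [{"role": "user", "content": "hi"}]
store.upsert_row(data=data)                     # same rollout_id: updates in place

assert len(store.read_rows(rollout_id="rollout-abc")) == 1
store.delete_row(rollout_id="rollout-abc")      # returns the number of deleted rows
```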
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4

```diff
@@ -158,8 +158,8 @@ class ExecutionManager:
                 messages.append(Message.model_validate(msg_dict))

             evaluation_rows[idx].messages = messages
-            evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
-            evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
+            # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
+            # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
             evaluation_rows[idx].tools = shared_tool_schema
             evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
             evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
@@ -482,11 +482,11 @@ class ExecutionManager:
         trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
         try:
             await envs.connection_manager.reset_session(session)
-        except:
+        except:  # noqa: E722
             logger.error(f"Error resetting session {session.session_id}")
         try:
             await envs.connection_manager.close_session(session)
-        except:
+        except:  # noqa: E722
             logger.error(f"Error closing session {session.session_id}")
         return trajectory
```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21

```diff
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
     )


+class EvaluationThreshold(BaseModel):
+    """Threshold configuration for evaluation tests.
+
+    The success field is required - tests must specify a minimum success rate.
+    The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
+    """
+
+    success: float = Field(
+        ..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+    standard_deviation: Optional[float] = Field(
+        None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
+    )
+
+
 class EvalMetadata(BaseModel):
     """Metadata about the evaluation that was run."""

@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
     )
     num_runs: int = Field(..., description="Number of times the evaluation was repeated")
     aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
-
+    passed_threshold: Optional[EvaluationThreshold] = Field(
+        None, description="Threshold configuration for test success"
+    )
     passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")


+class ExecutionMetadata(BaseModel):
+    """Metadata about the execution of the evaluation."""
+
+    invocation_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the invocation that this row belongs to.",
+    )
+
+    experiment_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the experiment that this row belongs to.",
+    )
+
+    rollout_id: Optional[str] = Field(
+        default_factory=generate_id,
+        description="The ID of the rollout that this row belongs to.",
+    )
+
+    run_id: Optional[str] = Field(
+        None,
+        description=("The ID of the run that this row belongs to."),
+    )
+
+
 class RolloutStatus(BaseModel):
     """Status of the rollout."""

@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
         description="The status of the rollout.",
     )

-    invocation_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the invocation that this row belongs to.",
-    )
-
-    cohort_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the cohort that this row belongs to.",
-    )
-
-    rollout_id: Optional[str] = Field(
-        default_factory=generate_id,
-        description="The ID of the rollout that this row belongs to.",
-    )
-
-    run_id: Optional[str] = Field(
-        None,
-        description=("The ID of the run that this row belongs to."),
-    )
-
     # Ground truth reference (moved from EvaluateResult to top level)
     ground_truth: Optional[str] = Field(
         default=None, description="Optional ground truth reference for this evaluation."
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
         default=None, description="The evaluation result for this row/trajectory."
     )

+    execution_metadata: ExecutionMetadata = Field(
+        default_factory=ExecutionMetadata,
+        description="Metadata about the execution of the evaluation.",
+    )
+
     # LLM usage statistics
     usage: Optional[CompletionUsage] = Field(
         default=None, description="Token usage statistics from LLM calls during execution."
```
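The net effect of these model changes: grouping IDs move off `EvaluationRow` onto a nested `ExecutionMetadata` object (with `cohort_id` replaced by `experiment_id`), and pass/fail thresholds become a validated model. A short sketch under those assumptions (the `Message(role=..., content=...)` constructor is inferred, not shown in this diff):

```python
from eval_protocol.models import EvaluationRow, EvaluationThreshold, Message

row = EvaluationRow(messages=[Message(role="user", content="What is 2+2?")])

# IDs now live under execution_metadata; invocation/experiment/rollout IDs are
# auto-generated via generate_id, while run_id starts as None.
print(row.execution_metadata.rollout_id, row.execution_metadata.run_id)

# Field constraints (ge=0.0, le=1.0) make out-of-range thresholds fail fast:
threshold = EvaluationThreshold(success=0.8, standard_deviation=0.1)
# EvaluationThreshold(success=1.5) would raise a pydantic ValidationError.
```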
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +5 -4

```diff
@@ -8,7 +8,7 @@ from openai import NOT_GIVEN, NotGiven
 from openai.types.chat import ChatCompletionContentPartTextParam, ChatCompletionMessage, ChatCompletionToolParam
 from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam

-from eval_protocol.dataset_logger import
+from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 from eval_protocol.mcp.execution.policy import LiteLLMPolicy
 from eval_protocol.mcp.mcp_multi_client import MCPMultiClient
 from eval_protocol.models import EvaluationRow, Message
@@ -20,12 +20,13 @@ class Agent:
     A really simple agent that calls the model until no more tool calls are needed.
     """

-    def __init__(self, model: str, row: EvaluationRow, config_path: str):
+    def __init__(self, model: str, row: EvaluationRow, config_path: str, logger: DatasetLogger):
         self.model = model
         self.evaluation_row: EvaluationRow = row
         self._policy = LiteLLMPolicy(model_id=model)
         self.mcp_client = MCPMultiClient(config_path=config_path) if config_path else None
         self.tools: Union[List[ChatCompletionToolParam], NotGiven] = NOT_GIVEN
+        self.logger: DatasetLogger = logger

     async def setup(self):
         if self.mcp_client:
@@ -42,7 +43,7 @@ class Agent:

     def append_message_and_log(self, message: Message):
         self.messages.append(message)
-
+        self.logger.log(self.evaluation_row)

     async def call_agent(self) -> str:
         """
@@ -116,7 +117,7 @@ async def default_agent_rollout_processor(
 ) -> List[EvaluationRow]:
     dataset: Dataset = []
     for row in rows:
-        agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path)
+        agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
         await agent.setup()
         await agent.call_agent()
         dataset.append(agent.evaluation_row)
```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +4 -5

```diff
@@ -1,11 +1,9 @@
 import asyncio
-from typing import List
-
 import logging
 import os
+from typing import List

-from eval_protocol.
-from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
+from eval_protocol.models import ChatCompletionMessageToolCall, EvaluationRow, Message
 from eval_protocol.pytest.types import RolloutProcessorConfig


@@ -49,6 +47,7 @@ async def default_single_turn_rollout_processor(

     # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
     import importlib
+
     _litellm = importlib.import_module("litellm")
     acompletion = getattr(_litellm, "acompletion")
     response = await acompletion(**request_params)
@@ -79,7 +78,7 @@ async def default_single_turn_rollout_processor(
     ]

     row.messages = messages
-
+    config.logger.log(row)
     return row

     # Process rows with bounded concurrency if configured
```
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +96 -38

```diff
@@ -3,13 +3,21 @@ import inspect
 import math
 import os
 import statistics
-from typing import Any, Callable, Dict, List, Literal, Optional
+from typing import Any, Callable, Dict, List, Literal, Optional, Union

 import pytest

 from eval_protocol.dataset_logger import default_logger
+from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
 from eval_protocol.human_id import generate_id
-from eval_protocol.models import
+from eval_protocol.models import (
+    CompletionParams,
+    EvalMetadata,
+    EvaluationRow,
+    EvaluationThreshold,
+    InputMetadata,
+    Message,
+)
 from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
 from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
 from eval_protocol.pytest.types import (
@@ -46,7 +54,7 @@ def evaluation_test(  # noqa: C901
     rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
     evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
     aggregation_method: AggregationMethod = "mean",
-
+    passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
     num_runs: int = 1,
     max_dataset_rows: Optional[int] = None,
     mcp_config_path: Optional[str] = None,
@@ -55,6 +63,7 @@ def evaluation_test(  # noqa: C901
     steps: int = 30,
     mode: EvaluationTestMode = "batch",
     combine_datasets: bool = True,
+    logger: Optional[DatasetLogger] = None,
 ) -> Callable[
     [TestFunction],
     TestFunction,
@@ -64,14 +73,14 @@ def evaluation_test(  # noqa: C901
     Here are some key concepts to understand the terminology in EP:

     - "invocation" is a single execution of a test function. An invocation can
-      generate 1 or more
+      generate 1 or more experiments. Grouping by invocation might be useful to
       aggregate eval scores across multiple invocations when you want to aggregate
       scores across multiple datasets.
-    - "
-
+    - "experiment" is a group of runs with for a combination of parameters. A single
+      experiment will have multiple runs if num_runs > 1.
       1. If your evaluation_test has combinations of parameters, it will generate
-         multiple
-      2. A new execution of a test function will generate a new
+         multiple experiments per combination of parameters.
+      2. A new execution of a test function will generate a new experiment.
     - "run" is a group of rollouts. For multiple num_runs > 1, there will be
       multiple "run_id"s.
     - "rollout" is the execution/process that produces a "trajectory". You
@@ -89,7 +98,7 @@ def evaluation_test(  # noqa: C901
       decorated test. It simply produces a score from 0 to 1 and attached it
       to the row as the "evaluation_result" field.

-    "invocation", "
+    "invocation", "experiment", "run", "rollout", and "row" each have a unique ID
     which can be used to easily group and identify your dataset by.

     Args:
@@ -106,8 +115,8 @@ def evaluation_test(  # noqa: C901
         rollout_processor: Function used to perform the rollout.
         evaluation_test_kwargs: Kwargs for the evaluation function.
         aggregation_method: How to aggregate scores across rows.
-
-            below
+        passed_threshold: Threshold configuration for test success.
+            Success rate must be above success, and if set, standard deviation must be below standard_deviation.
         num_runs: Number of times to repeat the rollout and evaluations.
         max_dataset_rows: Limit dataset to the first N rows.
         mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
@@ -117,11 +126,22 @@ def evaluation_test(  # noqa: C901
         mode: Evaluation mode. "batch" (default) expects test function to handle
             full dataset. "pointwise" applies test function to each row. If your evaluation requires
             the full rollout of all rows to compute the score, use
+        logger: DatasetLogger to use for logging. If not provided, a default logger will be used.
     """

+    active_logger: DatasetLogger = logger if logger else default_logger
+
     def decorator(
         test_func: TestFunction,
     ):
+        if passed_threshold is not None:
+            if isinstance(passed_threshold, float):
+                threshold = EvaluationThreshold(success=passed_threshold)
+            else:
+                threshold = EvaluationThreshold(**passed_threshold)
+        else:
+            threshold = None
+
         sig = inspect.signature(test_func)

         # For pointwise/rowwise mode, we expect a different signature
@@ -280,14 +300,14 @@ def evaluation_test(  # noqa: C901
         def wrapper_body(**kwargs):
             model_name = kwargs["model"]
             eval_metadata = None
-            all_results: List[EvaluationRow] = []
+            all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]

-
+            experiment_id = generate_id()

             def _log_eval_error(
                 status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
             ) -> None:
-                log_eval_status_and_rows(eval_metadata, rows, status, passed,
+                log_eval_status_and_rows(eval_metadata, rows, status, passed, active_logger)

             try:
                 # Handle dataset loading
@@ -341,7 +361,7 @@ def evaluation_test(  # noqa: C901
                     status="running",
                     num_runs=num_runs,
                     aggregation_method=aggregation_method,
-
+                    passed_threshold=threshold,
                     passed=None,
                 )

@@ -363,13 +383,12 @@ def evaluation_test(  # noqa: C901
                     row.input_metadata.session_data["mode"] = mode
                     # Initialize eval_metadata for each row
                     row.eval_metadata = eval_metadata
-                    row.
-                    row.invocation_id = invocation_id
+                    row.execution_metadata.experiment_id = experiment_id
+                    row.execution_metadata.invocation_id = invocation_id

                     # has to be done in the pytest main process since it's
                     # used to determine whether this eval has stopped
                     row.pid = os.getpid()
-                    default_logger.log(row)

                 # Prepare rollout processor config once; we will generate fresh outputs per run
                 config = RolloutProcessorConfig(
@@ -379,21 +398,26 @@ def evaluation_test(  # noqa: C901
                     max_concurrent_rollouts=max_concurrent_rollouts,
                     server_script_path=server_script_path,
                     steps=steps,
+                    logger=active_logger,
                 )

-                for
+                for i in range(num_runs):
                     # Regenerate outputs each run by deep-copying the pristine dataset
                     # so model responses are not reused across runs.
                     run_id = generate_id()
-                    fresh_dataset = [
+                    fresh_dataset = [r.model_copy(deep=True) for r in data]

                     # apply new run_id to fresh_dataset
                     for row in fresh_dataset:
-                        row.run_id = run_id
+                        row.execution_metadata.run_id = run_id

                     # generate new rollout_id for each row
                     for row in fresh_dataset:
-                        row.rollout_id = generate_id()
+                        row.execution_metadata.rollout_id = generate_id()
+
+                    # log the fresh_dataset
+                    for row in fresh_dataset:
+                        active_logger.log(row)

                     processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config)

@@ -409,7 +433,7 @@ def evaluation_test(  # noqa: C901
                             raise ValueError(
                                 f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
                             )
-                        all_results.append(result)
+                        all_results[i].append(result)
                     else:
                         # Batch mode: call the test function with the full dataset
                         results = execute_with_params(
@@ -433,17 +457,21 @@ def evaluation_test(  # noqa: C901
                             raise ValueError(
                                 f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
                             )
-                        all_results
+                        all_results[i] = results

-                scores = [
+                scores = [
+                    sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
+                    for result in all_results
+                ]
                 agg_score = aggregate(scores, aggregation_method)
+                score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0

                 # Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
                 ci_low: float | None = None
                 ci_high: float | None = None
                 if aggregation_method == "mean":
                     try:
-                        result_ci = compute_fixed_set_mu_ci(all_results)
+                        result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
                         mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
                         if mu_ci_low is not None and mu_ci_high is not None:
                             ci_low = float(mu_ci_low)
@@ -455,15 +483,24 @@ def evaluation_test(  # noqa: C901

                 # Determine if the evaluation passed based on threshold
                 passed = None
-
-
+
+                if threshold is not None:
+                    success_passed, std_passed = True, True
+
+                    success_passed = agg_score >= threshold.success
+
+                    if threshold.standard_deviation is not None:
+                        std_passed = score_std <= threshold.standard_deviation
+
+                    passed = success_passed and std_passed

                 # Update eval metadata status and passed field for all results
-                for
-
-                    r.eval_metadata
-
-
+                for result in all_results:
+                    for r in result:
+                        if r.eval_metadata is not None:
+                            r.eval_metadata.status = "finished"
+                            r.eval_metadata.passed = passed
+                            active_logger.log(r)

                 # Optional: print and/or persist a summary artifact for CI
                 try:
@@ -471,7 +508,7 @@ def evaluation_test(  # noqa: C901
                     summary_path = os.getenv("EP_SUMMARY_JSON")
                     suite_name = test_func.__name__
                     model_used = model_name
-                    total_rows = len(all_results)
+                    total_rows = len([item for sublist in all_results for item in sublist])
                     summary_obj = {
                         "suite": suite_name,
                         "model": model_used,
@@ -488,7 +525,7 @@ def evaluation_test(  # noqa: C901
                     from collections import defaultdict

                     metric_scores: Dict[str, list] = defaultdict(list)
-                    for r in all_results:
+                    for r in [item for sublist in all_results for item in sublist]:
                         if r.evaluation_result and r.evaluation_result.metrics:
                             for m_name, m_res in r.evaluation_result.metrics.items():
                                 if m_res is not None and getattr(m_res, "score", None) is not None:
@@ -587,11 +624,32 @@ def evaluation_test(  # noqa: C901
                     # Do not fail evaluation if summary writing fails
                     pass

+                # # Write all rows from active_logger.read() to a JSONL file in the same directory as the summary
+                # try:
+                #     if active_logger is not None:
+                #         rows = active_logger.read()
+                #         # Write to a .jsonl file alongside the summary file
+                #         jsonl_path = "logs.jsonl"
+                #         import json
+
+                #         with open(jsonl_path, "w", encoding="utf-8") as f_jsonl:
+                #             for row in rows:
+                #                 json.dump(row.model_dump(exclude_none=True, mode="json"), f_jsonl)
+                #                 f_jsonl.write("\n")
+                # except Exception as e:
+                #     # Do not fail evaluation if log writing fails
+                #     print(e)
+                #     pass
+
                 # Check threshold after logging
-                if
+                if threshold is not None and not passed:
                     assert (
-                        agg_score >=
-                    ), f"Aggregated score {agg_score:.3f} below threshold {
+                        agg_score >= threshold.success
+                    ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
+                    if threshold.standard_deviation is not None:
+                        assert (
+                            score_std <= threshold.standard_deviation
+                        ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"

             except AssertionError:
                 _log_eval_error("finished", data if "data" in locals() else None, passed=False)
```
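In short, `evaluation_test` now accepts either a float or an `EvaluationThreshold` for `passed_threshold`, collects results per run as `List[List[EvaluationRow]]`, and can also gate on the run-to-run standard deviation. A usage sketch showing only parameters visible in this diff (dataset/model arguments are omitted, and importing the decorator from `eval_protocol.pytest` is an assumption based on the package layout):

```python
from eval_protocol.models import EvaluationRow
from eval_protocol.pytest import evaluation_test

@evaluation_test(
    # ...dataset/model arguments omitted...
    passed_threshold=0.8,  # float shorthand, coerced to EvaluationThreshold(success=0.8)
    num_runs=3,            # the mean score is computed per run; stdev is taken across runs
    mode="pointwise",
)
def test_my_eval(row: EvaluationRow) -> EvaluationRow:
    # score the row and attach row.evaluation_result here
    return row
```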
{eval_protocol-0.2.6.dev2 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +8 -2

```diff
@@ -5,6 +5,9 @@ Parameter types
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Literal, Optional

+from eval_protocol.dataset_logger import default_logger
+from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
+
 from ..models import EvaluationRow, Message

 ModelParam = str  # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct
@@ -39,10 +42,13 @@ Rollout processor types
 class RolloutProcessorConfig:
     model: ModelParam
     input_params: RolloutInputParam  # optional input parameters for inference
-    mcp_config_path: str
-    server_script_path: Optional[str] =
+    mcp_config_path: str
+    server_script_path: Optional[str] = (
+        None  # TODO: change from server_script_path to mcp_config_path for agent rollout processor
+    )
     max_concurrent_rollouts: int = 8  # maximum number of concurrent rollouts
     steps: int = 30  # max number of rollout steps
+    logger: DatasetLogger = default_logger  # logger to use during rollout for mid-rollout logs


 RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]]
```