eval-protocol 0.2.7__tar.gz → 0.2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.7/eval_protocol.egg-info → eval_protocol-0.2.8}/PKG-INFO +1 -1
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/_version.py +3 -3
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli.py +1 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/logs.py +4 -3
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +3 -3
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/manager.py +4 -4
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/models.py +47 -21
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/evaluation_test.py +68 -36
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/logs_server.py +70 -20
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/vite_server.py +48 -17
- {eval_protocol-0.2.7 → eval_protocol-0.2.8/eval_protocol.egg-info}/PKG-INFO +1 -1
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/SOURCES.txt +6 -3
- eval_protocol-0.2.8/tests/test_logs_server.py +585 -0
- eval_protocol-0.2.8/tests/test_logs_server_simple.py +88 -0
- eval_protocol-0.2.8/tests/test_vite_server.py +224 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CGYj40Gx.css +1 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js +88 -0
- eval_protocol-0.2.8/vite-app/dist/assets/index-CoiGX-Xs.js.map +1 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/index.html +2 -2
- eval_protocol-0.2.7/vite-app/dist/assets/index-DWfIf2rx.css +0 -1
- eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js +0 -88
- eval_protocol-0.2.7/vite-app/dist/assets/index-D_nkLTVA.js.map +0 -1
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/LICENSE +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/README.md +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/common_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/dataset_logger/sqlite_dataset_logger_adapter.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/directory_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/event_bus.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/logger.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/event_bus/sqlite_event_bus_database.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/get_pep440_version.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/logging_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/client/connection.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/execution/policy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/mcpgym.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/session/manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/mcp_env.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/plugin.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/pytest/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/stats/confidence_intervals.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/types/types.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/entry_points.txt +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/requires.txt +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/pyproject.toml +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.cfg +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/setup.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_config.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_event_bus.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_format.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_length.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_math.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_rollout_control_plane_integration.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/evaluator/evaluator_nl_assertions.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/versioneer.py +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- {eval_protocol-0.2.7 → eval_protocol-0.2.8}/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.7
|
|
3
|
+
Version: 0.2.8
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-11T22:02:14-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "b004c422c7d873890fc88cc299935929fa966b1f",
|
|
15
|
+
"version": "0.2.8"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -289,6 +289,7 @@ def parse_args(args=None):
|
|
|
289
289
|
|
|
290
290
|
# Logs command
|
|
291
291
|
logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
|
|
292
|
+
logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
292
293
|
|
|
293
294
|
# Run command (for Hydra-based evaluations)
|
|
294
295
|
# This subparser intentionally defines no arguments itself.
|
|
@@ -11,15 +11,16 @@ from ..utils.logs_server import serve_logs
|
|
|
11
11
|
def logs_command(args):
|
|
12
12
|
"""Serve logs with file watching and real-time updates"""
|
|
13
13
|
|
|
14
|
+
port = args.port
|
|
14
15
|
print(f"🚀 Starting Eval Protocol Logs Server")
|
|
15
|
-
print(f"🌐 URL: http://localhost:
|
|
16
|
-
print(f"🔌 WebSocket: ws://localhost:
|
|
16
|
+
print(f"🌐 URL: http://localhost:{port}")
|
|
17
|
+
print(f"🔌 WebSocket: ws://localhost:{port}/ws")
|
|
17
18
|
print(f"👀 Watching paths: {['current directory']}")
|
|
18
19
|
print("Press Ctrl+C to stop the server")
|
|
19
20
|
print("-" * 50)
|
|
20
21
|
|
|
21
22
|
try:
|
|
22
|
-
serve_logs()
|
|
23
|
+
serve_logs(port=args.port)
|
|
23
24
|
return 0
|
|
24
25
|
except KeyboardInterrupt:
|
|
25
26
|
print("\n🛑 Server stopped by user")
|
|
@@ -37,9 +37,9 @@ class SqliteEvaluationRowStore:
|
|
|
37
37
|
return self._db_path
|
|
38
38
|
|
|
39
39
|
def upsert_row(self, data: dict) -> None:
|
|
40
|
-
rollout_id = data["rollout_id"]
|
|
41
|
-
if
|
|
42
|
-
raise ValueError("rollout_id is required to upsert a row")
|
|
40
|
+
rollout_id = data["execution_metadata"]["rollout_id"]
|
|
41
|
+
if rollout_id is None:
|
|
42
|
+
raise ValueError("execution_metadata.rollout_id is required to upsert a row")
|
|
43
43
|
if self._EvaluationRow.select().where(self._EvaluationRow.rollout_id == rollout_id).exists():
|
|
44
44
|
self._EvaluationRow.update(data=data).where(self._EvaluationRow.rollout_id == rollout_id).execute()
|
|
45
45
|
else:
|
|
@@ -158,8 +158,8 @@ class ExecutionManager:
|
|
|
158
158
|
messages.append(Message.model_validate(msg_dict))
|
|
159
159
|
|
|
160
160
|
evaluation_rows[idx].messages = messages
|
|
161
|
-
evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
|
|
162
|
-
evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
|
|
161
|
+
# evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id
|
|
162
|
+
# evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx])
|
|
163
163
|
evaluation_rows[idx].tools = shared_tool_schema
|
|
164
164
|
evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage)
|
|
165
165
|
evaluation_rows[idx].input_metadata.completion_params = CompletionParams(
|
|
@@ -482,11 +482,11 @@ class ExecutionManager:
|
|
|
482
482
|
trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"})
|
|
483
483
|
try:
|
|
484
484
|
await envs.connection_manager.reset_session(session)
|
|
485
|
-
except:
|
|
485
|
+
except: # noqa: E722
|
|
486
486
|
logger.error(f"Error resetting session {session.session_id}")
|
|
487
487
|
try:
|
|
488
488
|
await envs.connection_manager.close_session(session)
|
|
489
|
-
except:
|
|
489
|
+
except: # noqa: E722
|
|
490
490
|
logger.error(f"Error closing session {session.session_id}")
|
|
491
491
|
return trajectory
|
|
492
492
|
|
|
@@ -202,6 +202,21 @@ class InputMetadata(BaseModel):
|
|
|
202
202
|
)
|
|
203
203
|
|
|
204
204
|
|
|
205
|
+
class EvaluationThreshold(BaseModel):
|
|
206
|
+
"""Threshold configuration for evaluation tests.
|
|
207
|
+
|
|
208
|
+
The success field is required - tests must specify a minimum success rate.
|
|
209
|
+
The standard_deviation field is optional - if provided, tests must also meet the maximum standard deviation requirement.
|
|
210
|
+
"""
|
|
211
|
+
|
|
212
|
+
success: float = Field(
|
|
213
|
+
..., description="Minimum success rate threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
|
|
214
|
+
)
|
|
215
|
+
standard_deviation: Optional[float] = Field(
|
|
216
|
+
None, description="Maximum standard deviation threshold (fraction of total score, 0.0 to 1.0)", ge=0.0, le=1.0
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
|
|
205
220
|
class EvalMetadata(BaseModel):
|
|
206
221
|
"""Metadata about the evaluation that was run."""
|
|
207
222
|
|
|
@@ -216,10 +231,36 @@ class EvalMetadata(BaseModel):
|
|
|
216
231
|
)
|
|
217
232
|
num_runs: int = Field(..., description="Number of times the evaluation was repeated")
|
|
218
233
|
aggregation_method: str = Field(..., description="Method used to aggregate scores across runs")
|
|
219
|
-
|
|
234
|
+
passed_threshold: Optional[EvaluationThreshold] = Field(
|
|
235
|
+
None, description="Threshold configuration for test success"
|
|
236
|
+
)
|
|
220
237
|
passed: Optional[bool] = Field(None, description="Whether the evaluation passed based on the threshold")
|
|
221
238
|
|
|
222
239
|
|
|
240
|
+
class ExecutionMetadata(BaseModel):
|
|
241
|
+
"""Metadata about the execution of the evaluation."""
|
|
242
|
+
|
|
243
|
+
invocation_id: Optional[str] = Field(
|
|
244
|
+
default_factory=generate_id,
|
|
245
|
+
description="The ID of the invocation that this row belongs to.",
|
|
246
|
+
)
|
|
247
|
+
|
|
248
|
+
experiment_id: Optional[str] = Field(
|
|
249
|
+
default_factory=generate_id,
|
|
250
|
+
description="The ID of the experiment that this row belongs to.",
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
rollout_id: Optional[str] = Field(
|
|
254
|
+
default_factory=generate_id,
|
|
255
|
+
description="The ID of the rollout that this row belongs to.",
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
run_id: Optional[str] = Field(
|
|
259
|
+
None,
|
|
260
|
+
description=("The ID of the run that this row belongs to."),
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
|
|
223
264
|
class RolloutStatus(BaseModel):
|
|
224
265
|
"""Status of the rollout."""
|
|
225
266
|
|
|
@@ -264,26 +305,6 @@ class EvaluationRow(BaseModel):
|
|
|
264
305
|
description="The status of the rollout.",
|
|
265
306
|
)
|
|
266
307
|
|
|
267
|
-
invocation_id: Optional[str] = Field(
|
|
268
|
-
default_factory=generate_id,
|
|
269
|
-
description="The ID of the invocation that this row belongs to.",
|
|
270
|
-
)
|
|
271
|
-
|
|
272
|
-
cohort_id: Optional[str] = Field(
|
|
273
|
-
default_factory=generate_id,
|
|
274
|
-
description="The ID of the cohort that this row belongs to.",
|
|
275
|
-
)
|
|
276
|
-
|
|
277
|
-
rollout_id: Optional[str] = Field(
|
|
278
|
-
default_factory=generate_id,
|
|
279
|
-
description="The ID of the rollout that this row belongs to.",
|
|
280
|
-
)
|
|
281
|
-
|
|
282
|
-
run_id: Optional[str] = Field(
|
|
283
|
-
None,
|
|
284
|
-
description=("The ID of the run that this row belongs to."),
|
|
285
|
-
)
|
|
286
|
-
|
|
287
308
|
# Ground truth reference (moved from EvaluateResult to top level)
|
|
288
309
|
ground_truth: Optional[str] = Field(
|
|
289
310
|
default=None, description="Optional ground truth reference for this evaluation."
|
|
@@ -294,6 +315,11 @@ class EvaluationRow(BaseModel):
|
|
|
294
315
|
default=None, description="The evaluation result for this row/trajectory."
|
|
295
316
|
)
|
|
296
317
|
|
|
318
|
+
execution_metadata: ExecutionMetadata = Field(
|
|
319
|
+
default_factory=ExecutionMetadata,
|
|
320
|
+
description="Metadata about the execution of the evaluation.",
|
|
321
|
+
)
|
|
322
|
+
|
|
297
323
|
# LLM usage statistics
|
|
298
324
|
usage: Optional[CompletionUsage] = Field(
|
|
299
325
|
default=None, description="Token usage statistics from LLM calls during execution."
|
|
@@ -3,14 +3,21 @@ import inspect
|
|
|
3
3
|
import math
|
|
4
4
|
import os
|
|
5
5
|
import statistics
|
|
6
|
-
from typing import Any, Callable, Dict, List, Literal, Optional
|
|
6
|
+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
|
|
7
7
|
|
|
8
8
|
import pytest
|
|
9
9
|
|
|
10
10
|
from eval_protocol.dataset_logger import default_logger
|
|
11
11
|
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
|
|
12
12
|
from eval_protocol.human_id import generate_id
|
|
13
|
-
from eval_protocol.models import
|
|
13
|
+
from eval_protocol.models import (
|
|
14
|
+
CompletionParams,
|
|
15
|
+
EvalMetadata,
|
|
16
|
+
EvaluationRow,
|
|
17
|
+
EvaluationThreshold,
|
|
18
|
+
InputMetadata,
|
|
19
|
+
Message,
|
|
20
|
+
)
|
|
14
21
|
from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter
|
|
15
22
|
from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor
|
|
16
23
|
from eval_protocol.pytest.types import (
|
|
@@ -47,7 +54,7 @@ def evaluation_test( # noqa: C901
|
|
|
47
54
|
rollout_processor: RolloutProcessor = default_no_op_rollout_processor,
|
|
48
55
|
evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None,
|
|
49
56
|
aggregation_method: AggregationMethod = "mean",
|
|
50
|
-
|
|
57
|
+
passed_threshold: Optional[Union[EvaluationThreshold, float]] = None,
|
|
51
58
|
num_runs: int = 1,
|
|
52
59
|
max_dataset_rows: Optional[int] = None,
|
|
53
60
|
mcp_config_path: Optional[str] = None,
|
|
@@ -66,14 +73,14 @@ def evaluation_test( # noqa: C901
|
|
|
66
73
|
Here are some key concepts to understand the terminology in EP:
|
|
67
74
|
|
|
68
75
|
- "invocation" is a single execution of a test function. An invocation can
|
|
69
|
-
generate 1 or more
|
|
76
|
+
generate 1 or more experiments. Grouping by invocation might be useful to
|
|
70
77
|
aggregate eval scores across multiple invocations when you want to aggregate
|
|
71
78
|
scores across multiple datasets.
|
|
72
|
-
- "
|
|
73
|
-
|
|
79
|
+
- "experiment" is a group of runs for a combination of parameters. A single
|
|
80
|
+
experiment will have multiple runs if num_runs > 1.
|
|
74
81
|
1. If your evaluation_test has combinations of parameters, it will generate
|
|
75
|
-
multiple
|
|
76
|
-
2. A new execution of a test function will generate a new
|
|
82
|
+
multiple experiments per combination of parameters.
|
|
83
|
+
2. A new execution of a test function will generate a new experiment.
|
|
77
84
|
- "run" is a group of rollouts. For multiple num_runs > 1, there will be
|
|
78
85
|
multiple "run_id"s.
|
|
79
86
|
- "rollout" is the execution/process that produces a "trajectory". You
|
|
@@ -91,7 +98,7 @@ def evaluation_test( # noqa: C901
|
|
|
91
98
|
decorated test. It simply produces a score from 0 to 1 and attaches it
|
|
92
99
|
to the row as the "evaluation_result" field.
|
|
93
100
|
|
|
94
|
-
"invocation", "
|
|
101
|
+
"invocation", "experiment", "run", "rollout", and "row" each have a unique ID
|
|
95
102
|
which can be used to easily group and identify your dataset by.
|
|
96
103
|
|
|
97
104
|
Args:
|
|
@@ -108,8 +115,8 @@ def evaluation_test( # noqa: C901
|
|
|
108
115
|
rollout_processor: Function used to perform the rollout.
|
|
109
116
|
evaluation_test_kwargs: Kwargs for the evaluation function.
|
|
110
117
|
aggregation_method: How to aggregate scores across rows.
|
|
111
|
-
|
|
112
|
-
below
|
|
118
|
+
passed_threshold: Threshold configuration for test success.
|
|
119
|
+
The aggregated score must be at or above success and, if set, the standard deviation of the per-run scores must be at or below standard_deviation.
|
|
113
120
|
num_runs: Number of times to repeat the rollout and evaluations.
|
|
114
121
|
max_dataset_rows: Limit dataset to the first N rows.
|
|
115
122
|
mcp_config_path: Path to MCP config file that follows MCPMultiClientConfiguration schema
|
|
@@ -127,6 +134,14 @@ def evaluation_test( # noqa: C901
|
|
|
127
134
|
def decorator(
|
|
128
135
|
test_func: TestFunction,
|
|
129
136
|
):
|
|
137
|
+
if passed_threshold is not None:
|
|
138
|
+
if isinstance(passed_threshold, float):
|
|
139
|
+
threshold = EvaluationThreshold(success=passed_threshold)
|
|
140
|
+
else:
|
|
141
|
+
threshold = EvaluationThreshold(**passed_threshold)
|
|
142
|
+
else:
|
|
143
|
+
threshold = None
|
|
144
|
+
|
|
130
145
|
sig = inspect.signature(test_func)
|
|
131
146
|
|
|
132
147
|
# For pointwise/rowwise mode, we expect a different signature
|
|
@@ -285,9 +300,9 @@ def evaluation_test( # noqa: C901
|
|
|
285
300
|
def wrapper_body(**kwargs):
|
|
286
301
|
model_name = kwargs["model"]
|
|
287
302
|
eval_metadata = None
|
|
288
|
-
all_results: List[EvaluationRow] = []
|
|
303
|
+
all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)]
|
|
289
304
|
|
|
290
|
-
|
|
305
|
+
experiment_id = generate_id()
|
|
291
306
|
|
|
292
307
|
def _log_eval_error(
|
|
293
308
|
status: Literal["finished", "error"], rows: Optional[List[EvaluationRow]] | None, passed: bool
|
|
@@ -346,7 +361,7 @@ def evaluation_test( # noqa: C901
|
|
|
346
361
|
status="running",
|
|
347
362
|
num_runs=num_runs,
|
|
348
363
|
aggregation_method=aggregation_method,
|
|
349
|
-
|
|
364
|
+
passed_threshold=threshold,
|
|
350
365
|
passed=None,
|
|
351
366
|
)
|
|
352
367
|
|
|
@@ -368,8 +383,8 @@ def evaluation_test( # noqa: C901
|
|
|
368
383
|
row.input_metadata.session_data["mode"] = mode
|
|
369
384
|
# Initialize eval_metadata for each row
|
|
370
385
|
row.eval_metadata = eval_metadata
|
|
371
|
-
row.
|
|
372
|
-
row.invocation_id = invocation_id
|
|
386
|
+
row.execution_metadata.experiment_id = experiment_id
|
|
387
|
+
row.execution_metadata.invocation_id = invocation_id
|
|
373
388
|
|
|
374
389
|
# has to be done in the pytest main process since it's
|
|
375
390
|
# used to determine whether this eval has stopped
|
|
@@ -386,19 +401,19 @@ def evaluation_test( # noqa: C901
|
|
|
386
401
|
logger=active_logger,
|
|
387
402
|
)
|
|
388
403
|
|
|
389
|
-
for
|
|
404
|
+
for i in range(num_runs):
|
|
390
405
|
# Regenerate outputs each run by deep-copying the pristine dataset
|
|
391
406
|
# so model responses are not reused across runs.
|
|
392
407
|
run_id = generate_id()
|
|
393
|
-
fresh_dataset = [
|
|
408
|
+
fresh_dataset = [r.model_copy(deep=True) for r in data]
|
|
394
409
|
|
|
395
410
|
# apply new run_id to fresh_dataset
|
|
396
411
|
for row in fresh_dataset:
|
|
397
|
-
row.run_id = run_id
|
|
412
|
+
row.execution_metadata.run_id = run_id
|
|
398
413
|
|
|
399
414
|
# generate new rollout_id for each row
|
|
400
415
|
for row in fresh_dataset:
|
|
401
|
-
row.rollout_id = generate_id()
|
|
416
|
+
row.execution_metadata.rollout_id = generate_id()
|
|
402
417
|
|
|
403
418
|
# log the fresh_dataset
|
|
404
419
|
for row in fresh_dataset:
|
|
@@ -418,7 +433,7 @@ def evaluation_test( # noqa: C901
|
|
|
418
433
|
raise ValueError(
|
|
419
434
|
f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test."
|
|
420
435
|
)
|
|
421
|
-
all_results.append(result)
|
|
436
|
+
all_results[i].append(result)
|
|
422
437
|
else:
|
|
423
438
|
# Batch mode: call the test function with the full dataset
|
|
424
439
|
results = execute_with_params(
|
|
@@ -442,17 +457,21 @@ def evaluation_test( # noqa: C901
|
|
|
442
457
|
raise ValueError(
|
|
443
458
|
f"Test function {test_func.__name__} returned a list containing non-EvaluationRow instances. You must return a list of EvaluationRow instances from your test function decorated with @evaluation_test."
|
|
444
459
|
)
|
|
445
|
-
all_results
|
|
460
|
+
all_results[i] = results
|
|
446
461
|
|
|
447
|
-
scores = [
|
|
462
|
+
scores = [
|
|
463
|
+
sum([r.evaluation_result.score for r in result if r.evaluation_result]) / len(result)
|
|
464
|
+
for result in all_results
|
|
465
|
+
]
|
|
448
466
|
agg_score = aggregate(scores, aggregation_method)
|
|
467
|
+
score_std = statistics.stdev(scores) if len(scores) > 1 else 0.0
|
|
449
468
|
|
|
450
469
|
# Compute 95% confidence interval for the fixed-set mean μ (by-question, using repeats)
|
|
451
470
|
ci_low: float | None = None
|
|
452
471
|
ci_high: float | None = None
|
|
453
472
|
if aggregation_method == "mean":
|
|
454
473
|
try:
|
|
455
|
-
result_ci = compute_fixed_set_mu_ci(all_results)
|
|
474
|
+
result_ci = compute_fixed_set_mu_ci([item for sublist in all_results for item in sublist])
|
|
456
475
|
mu_ci_low, mu_ci_high = result_ci[1], result_ci[2]
|
|
457
476
|
if mu_ci_low is not None and mu_ci_high is not None:
|
|
458
477
|
ci_low = float(mu_ci_low)
|
|
@@ -464,15 +483,24 @@ def evaluation_test( # noqa: C901
|
|
|
464
483
|
|
|
465
484
|
# Determine if the evaluation passed based on threshold
|
|
466
485
|
passed = None
|
|
467
|
-
|
|
468
|
-
|
|
486
|
+
|
|
487
|
+
if threshold is not None:
|
|
488
|
+
success_passed, std_passed = True, True
|
|
489
|
+
|
|
490
|
+
success_passed = agg_score >= threshold.success
|
|
491
|
+
|
|
492
|
+
if threshold.standard_deviation is not None:
|
|
493
|
+
std_passed = score_std <= threshold.standard_deviation
|
|
494
|
+
|
|
495
|
+
passed = success_passed and std_passed
|
|
469
496
|
|
|
470
497
|
# Update eval metadata status and passed field for all results
|
|
471
|
-
for
|
|
472
|
-
|
|
473
|
-
r.eval_metadata
|
|
474
|
-
|
|
475
|
-
|
|
498
|
+
for result in all_results:
|
|
499
|
+
for r in result:
|
|
500
|
+
if r.eval_metadata is not None:
|
|
501
|
+
r.eval_metadata.status = "finished"
|
|
502
|
+
r.eval_metadata.passed = passed
|
|
503
|
+
active_logger.log(r)
|
|
476
504
|
|
|
477
505
|
# Optional: print and/or persist a summary artifact for CI
|
|
478
506
|
try:
|
|
@@ -480,7 +508,7 @@ def evaluation_test( # noqa: C901
|
|
|
480
508
|
summary_path = os.getenv("EP_SUMMARY_JSON")
|
|
481
509
|
suite_name = test_func.__name__
|
|
482
510
|
model_used = model_name
|
|
483
|
-
total_rows = len(all_results)
|
|
511
|
+
total_rows = len([item for sublist in all_results for item in sublist])
|
|
484
512
|
summary_obj = {
|
|
485
513
|
"suite": suite_name,
|
|
486
514
|
"model": model_used,
|
|
@@ -497,7 +525,7 @@ def evaluation_test( # noqa: C901
|
|
|
497
525
|
from collections import defaultdict
|
|
498
526
|
|
|
499
527
|
metric_scores: Dict[str, list] = defaultdict(list)
|
|
500
|
-
for r in all_results:
|
|
528
|
+
for r in [item for sublist in all_results for item in sublist]:
|
|
501
529
|
if r.evaluation_result and r.evaluation_result.metrics:
|
|
502
530
|
for m_name, m_res in r.evaluation_result.metrics.items():
|
|
503
531
|
if m_res is not None and getattr(m_res, "score", None) is not None:
|
|
@@ -614,10 +642,14 @@ def evaluation_test( # noqa: C901
|
|
|
614
642
|
# pass
|
|
615
643
|
|
|
616
644
|
# Check threshold after logging
|
|
617
|
-
if
|
|
645
|
+
if threshold is not None and not passed:
|
|
618
646
|
assert (
|
|
619
|
-
agg_score >=
|
|
620
|
-
), f"Aggregated score {agg_score:.3f} below threshold {
|
|
647
|
+
agg_score >= threshold.success
|
|
648
|
+
), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}"
|
|
649
|
+
if threshold.standard_deviation is not None:
|
|
650
|
+
assert (
|
|
651
|
+
score_std <= threshold.standard_deviation
|
|
652
|
+
), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}"
|
|
621
653
|
|
|
622
654
|
except AssertionError:
|
|
623
655
|
_log_eval_error("finished", data if "data" in locals() else None, passed=False)
|
|
@@ -87,18 +87,32 @@ class WebSocketManager:
|
|
|
87
87
|
return
|
|
88
88
|
|
|
89
89
|
tasks = []
|
|
90
|
+
failed_connections = []
|
|
91
|
+
|
|
90
92
|
for connection in connections:
|
|
91
93
|
try:
|
|
92
94
|
tasks.append(connection.send_text(text))
|
|
93
95
|
except Exception as e:
|
|
94
96
|
logger.error(f"Failed to send text to WebSocket: {e}")
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
except ValueError:
|
|
99
|
-
pass
|
|
97
|
+
failed_connections.append(connection)
|
|
98
|
+
|
|
99
|
+
# Execute all sends in parallel
|
|
100
100
|
if tasks:
|
|
101
|
-
await asyncio.gather(*tasks, return_exceptions=True)
|
|
101
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
102
|
+
|
|
103
|
+
# Check for any exceptions that occurred during execution
|
|
104
|
+
for i, result in enumerate(results):
|
|
105
|
+
if isinstance(result, Exception):
|
|
106
|
+
logger.error(f"Failed to send text to WebSocket: {result}")
|
|
107
|
+
failed_connections.append(connections[i])
|
|
108
|
+
|
|
109
|
+
# Remove all failed connections
|
|
110
|
+
with self._lock:
|
|
111
|
+
for connection in failed_connections:
|
|
112
|
+
try:
|
|
113
|
+
self.active_connections.remove(connection)
|
|
114
|
+
except ValueError:
|
|
115
|
+
pass
|
|
102
116
|
|
|
103
117
|
def start_broadcast_loop(self):
|
|
104
118
|
"""Start the broadcast loop in the current event loop."""
|
|
@@ -109,6 +123,7 @@ class WebSocketManager:
|
|
|
109
123
|
"""Stop the broadcast loop."""
|
|
110
124
|
if self._broadcast_task and not self._broadcast_task.done():
|
|
111
125
|
self._broadcast_task.cancel()
|
|
126
|
+
self._broadcast_task = None
|
|
112
127
|
|
|
113
128
|
|
|
114
129
|
class EvaluationWatcher:
|
|
@@ -233,7 +248,6 @@ class LogsServer(ViteServer):
|
|
|
233
248
|
|
|
234
249
|
# Subscribe to events and start listening for cross-process events
|
|
235
250
|
event_bus.subscribe(self._handle_event)
|
|
236
|
-
event_bus.start_listening()
|
|
237
251
|
|
|
238
252
|
logger.info(f"LogsServer initialized on {host}:{port}")
|
|
239
253
|
|
|
@@ -273,6 +287,12 @@ class LogsServer(ViteServer):
|
|
|
273
287
|
data = EvaluationRow(**data)
|
|
274
288
|
self.websocket_manager.broadcast_row_upserted(data)
|
|
275
289
|
|
|
290
|
+
def start_loops(self):
|
|
291
|
+
"""Start the broadcast loop and evaluation watcher."""
|
|
292
|
+
self.websocket_manager.start_broadcast_loop()
|
|
293
|
+
self.evaluation_watcher.start()
|
|
294
|
+
event_bus.start_listening()
|
|
295
|
+
|
|
276
296
|
async def run_async(self):
|
|
277
297
|
"""
|
|
278
298
|
Run the logs server asynchronously with file watching.
|
|
@@ -285,11 +305,7 @@ class LogsServer(ViteServer):
|
|
|
285
305
|
logger.info(f"Serving files from: {self.build_dir}")
|
|
286
306
|
logger.info("WebSocket endpoint available at /ws")
|
|
287
307
|
|
|
288
|
-
|
|
289
|
-
self.websocket_manager.start_broadcast_loop()
|
|
290
|
-
|
|
291
|
-
# Start the evaluation watcher
|
|
292
|
-
self.evaluation_watcher.start()
|
|
308
|
+
self.start_loops()
|
|
293
309
|
|
|
294
310
|
config = uvicorn.Config(
|
|
295
311
|
self.app,
|
|
@@ -319,20 +335,54 @@ class LogsServer(ViteServer):
|
|
|
319
335
|
asyncio.run(self.run_async())
|
|
320
336
|
|
|
321
337
|
|
|
322
|
-
|
|
323
|
-
|
|
338
|
+
def create_app(host: str = "localhost", port: int = 8000, build_dir: Optional[str] = None) -> FastAPI:
|
|
339
|
+
"""
|
|
340
|
+
Factory function to create a FastAPI app instance and start the server with async loops.
|
|
341
|
+
|
|
342
|
+
This creates a LogsServer instance and calls its start_loops() method to ensure
|
|
343
|
+
all async loops (WebSocket broadcast, evaluation watching) are running.
|
|
324
344
|
|
|
345
|
+
Args:
|
|
346
|
+
host: Host to bind to
|
|
347
|
+
port: Port to bind to
|
|
348
|
+
build_dir: Optional custom build directory path
|
|
325
349
|
|
|
326
|
-
|
|
350
|
+
Returns:
|
|
351
|
+
FastAPI app instance with server running in background
|
|
352
|
+
"""
|
|
353
|
+
if build_dir is None:
|
|
354
|
+
build_dir = os.path.abspath(
|
|
355
|
+
os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "vite-app", "dist")
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
server = LogsServer(host=host, port=port, build_dir=build_dir)
|
|
359
|
+
server.start_loops()
|
|
360
|
+
return server.app
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
# For backward compatibility and direct usage
|
|
364
|
+
def serve_logs(port: Optional[int] = None):
|
|
327
365
|
"""
|
|
328
366
|
Convenience function to create and run a LogsServer.
|
|
329
367
|
"""
|
|
330
|
-
|
|
331
|
-
if server is None:
|
|
332
|
-
server = LogsServer()
|
|
333
|
-
app = server.app
|
|
368
|
+
server = LogsServer(port=port)
|
|
334
369
|
server.run()
|
|
335
370
|
|
|
336
371
|
|
|
337
372
|
if __name__ == "__main__":
|
|
338
|
-
|
|
373
|
+
import argparse
|
|
374
|
+
|
|
375
|
+
parser = argparse.ArgumentParser(description="Start the evaluation logs server")
|
|
376
|
+
parser.add_argument("--host", default="localhost", help="Host to bind to (default: localhost)")
|
|
377
|
+
parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
|
|
378
|
+
parser.add_argument("--build-dir", help="Path to Vite build directory")
|
|
379
|
+
|
|
380
|
+
args = parser.parse_args()
|
|
381
|
+
|
|
382
|
+
# Create server with command line arguments
|
|
383
|
+
if args.build_dir:
|
|
384
|
+
server = LogsServer(host=args.host, port=args.port, build_dir=args.build_dir)
|
|
385
|
+
else:
|
|
386
|
+
server = LogsServer(host=args.host, port=args.port)
|
|
387
|
+
|
|
388
|
+
server.run()
|