eval-protocol 0.2.5.dev1__tar.gz → 0.2.6.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {eval_protocol-0.2.5.dev1/eval_protocol.egg-info → eval_protocol-0.2.6.dev1}/PKG-INFO +3 -3
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/README.md +1 -1
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/__init__.py +4 -3
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/_version.py +3 -3
- eval_protocol-0.2.6.dev1/eval_protocol/common_utils.py +30 -0
- eval_protocol-0.2.6.dev1/eval_protocol/dataset_logger/directory_utils.py +55 -0
- eval_protocol-0.2.6.dev1/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +98 -0
- eval_protocol-0.2.6.dev1/eval_protocol/get_pep440_version.py +133 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/human_id/__init__.py +3 -2
- eval_protocol-0.2.6.dev1/eval_protocol/logging_utils.py +175 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/client/connection.py +11 -39
- eval_protocol-0.2.6.dev1/eval_protocol/mcp/execution/manager.py +547 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/execution/policy.py +11 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/mcpgym.py +6 -1
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/session/manager.py +3 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_env.py +32 -4
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/models.py +32 -2
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +12 -5
- eval_protocol-0.2.6.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +64 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/evaluation_test.py +6 -3
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/utils.py +0 -2
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/types/types.py +1 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/logs_server.py +5 -1
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1/eval_protocol.egg-info}/PKG-INFO +3 -3
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol.egg-info/SOURCES.txt +9 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol.egg-info/entry_points.txt +1 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/pyproject.toml +6 -2
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_rollout_control_plane_integration.py +2 -1
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator_nl_assertions.py +27 -10
- eval_protocol-0.2.6.dev1/vite-app/dist/assets/favicon-BkAAWQga.png +0 -0
- eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-BMc_e8JT.js +88 -0
- eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-BMc_e8JT.js.map +1 -0
- eval_protocol-0.2.6.dev1/vite-app/dist/assets/index-Dp7ms4NJ.css +1 -0
- eval_protocol-0.2.6.dev1/vite-app/dist/assets/logo-light-BprIBJQW.png +0 -0
- eval_protocol-0.2.6.dev1/vite-app/dist/index.html +14 -0
- eval_protocol-0.2.5.dev1/eval_protocol/common_utils.py +0 -36
- eval_protocol-0.2.5.dev1/eval_protocol/dataset_logger/local_fs_dataset_logger_adapter.py +0 -114
- eval_protocol-0.2.5.dev1/eval_protocol/mcp/execution/manager.py +0 -526
- eval_protocol-0.2.5.dev1/eval_protocol/pytest/default_single_turn_rollout_process.py +0 -50
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/LICENSE +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/development/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/development/normalize_sandbox_fusion.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/development/utils/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/development/utils/generate_api_key.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/development/utils/subprocess_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/__main__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/adapters/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/adapters/braintrust.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/adapters/huggingface.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/adapters/langfuse.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/adapters/trl.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/models.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/orchestrator.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resource_abc.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resource_pool.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/bfcl_envs/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/bfcl_envs/math_api.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/bfcl_envs/posting_api.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/bfcl_sim_api_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/docker_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/filesystem_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/http_rollout_protocol.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/http_rollout_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/python_state_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/resources/sql_resource.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/task_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/agent/tool_registry.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/auth.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/agent_eval_cmd.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/common.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/deploy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/deploy_mcp.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/logs.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/preview.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/cli_commands/run_eval_cmd.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/config.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/dataset_logger/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/dataset_logger/dataset_logger.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/datasets/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/datasets/loader.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/evaluation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/execution/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/execution/pipeline.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/gcp_tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/generation/cache.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/generation/clients/base.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/generation/clients.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/generic_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/human_id/dictionary.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/integrations/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/integrations/braintrust.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/integrations/deepeval.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/integrations/openeval.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/integrations/trl.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/adapter.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/client/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/clients.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/execution/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/execution/base_policy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/grid_renderer.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/mcp_multi_client.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/process_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/session/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/simple_process_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/simulation_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/config.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/intermediary_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/main.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/orchestration/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/orchestration/base_client.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/orchestration/local_docker_client.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/orchestration/remote_http_client.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp_agent/session.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/packaging.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/platform_api.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/playback_policy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/default_agent_rollout_processor.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/default_dataset_adapter.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/default_no_op_rollout_process.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/pytest/types.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/resources.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/reward_function.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/accuracy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/accuracy_length.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/apps_coding_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/apps_execution_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/apps_testing_util.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/bfcl_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/code_execution.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/code_execution_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/cpp_code.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/deepcoder_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/format.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/function_calling.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/json_schema.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/language_consistency.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/lean_prover.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/length.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/math.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/reasoning_steps.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/repetition.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rewards/tag_count.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/rl_processing.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/typed_interface.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/types/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/batch_evaluation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/batch_transformation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/dataset_helpers.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/module_loader.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/packaging_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/static_policy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/utils/vite_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol.egg-info/dependency_links.txt +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol.egg-info/requires.txt +1 -1
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol.egg-info/top_level.txt +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/setup.cfg +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/setup.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_accuracy.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_accuracy_length.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_adapters_e2e.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_agent_orchestrator.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_agent_resources.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_auth.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_batch_evaluation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_braintrust_adapter.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_braintrust_example.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_cli.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_cli_agent.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_cli_args.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_code_execution.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_config.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_control_plane_separation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_cpp_code.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_data_driven_task_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_deepcoder_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_deepeval_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_deploy_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_e2b_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_e2b_js_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_edge_cases.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_eval_protocol_import.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_evaluation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_evaluation_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_evaluation_preview_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_examples_end_to_end.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_fireworks_api.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_format.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_fractional_code.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_frozen_lake_http_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_frozen_lake_seed_evaluation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_function_calling.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_gcp_tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_generic_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_json_schema.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_kwargs_validation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_language_consistency.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_lean_prover.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_lean_prover_runner.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_length.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_list_comparison_math_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_math.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_minimal.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_models.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_models_rl.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_multiple_choice_math_reward.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_n_variant_batch_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_n_variant_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_openai_compatibility.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_openeval_integration.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_packaging.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_parallel_rollouts.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_platform_api.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_readiness.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_reasoning_steps.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_repetition.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_repetition_debug.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_reward_function.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_reward_protocol_import.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_rl_processing.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_tag_count.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_typed_interface.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_typed_interface_rl.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/tests/test_url_handling.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/agent/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/agent/base.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/agent/llm_agent.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/api_service/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/api_service/api_config.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/api_service/data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/api_service/simulation_service.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/cli.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/config.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/data_model/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/data_model/message.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/data_model/simulation.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/data_model/tasks.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/airline/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/airline/data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/airline/environment.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/airline/tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/airline/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/mock/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/mock/data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/mock/environment.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/mock/tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/mock/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/retail/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/retail/data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/retail/environment.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/retail/tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/retail/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/environment.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/const.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/create_tasks.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/mms_issues.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/service_issues.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tasks/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/user_data_model.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/user_tools.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/domains/telecom/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/db.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/environment.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/server.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/tool.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/toolkit.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/environment/utils/interface_agent.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator_action.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator_base.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator_communicate.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/evaluator/evaluator_env.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/metrics/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/metrics/agent_metrics.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/metrics/break_down_metrics.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/orchestrator/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/orchestrator/environment_manager.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/orchestrator/orchestrator.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/orchestrator/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/registry.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/run.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/scripts/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/scripts/check_data.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/scripts/show_domain_doc.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/scripts/start_servers.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/scripts/view_simulations.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/user/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/user/base.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/user/user_simulator.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/__init__.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/display.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/io_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/llm_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/pydantic_utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/vendor/tau2/utils/utils.py +0 -0
- {eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/versioneer.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-protocol
|
|
3
|
-
Version: 0.2.5.dev1
|
|
3
|
+
Version: 0.2.6.dev1
|
|
4
4
|
Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
|
|
5
5
|
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -41,10 +41,10 @@ Requires-Dist: pandas>=1.5.0
|
|
|
41
41
|
Requires-Dist: watchdog>=2.1.0
|
|
42
42
|
Requires-Dist: websockets>=15.0.1
|
|
43
43
|
Requires-Dist: fastapi>=0.116.1
|
|
44
|
+
Requires-Dist: pytest>=6.0.0
|
|
44
45
|
Provides-Extra: dev
|
|
45
46
|
Requires-Dist: build; extra == "dev"
|
|
46
47
|
Requires-Dist: twine; extra == "dev"
|
|
47
|
-
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
48
48
|
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
49
49
|
Requires-Dist: pytest-httpserver; extra == "dev"
|
|
50
50
|
Requires-Dist: werkzeug>=2.0.0; extra == "dev"
|
|
@@ -109,7 +109,7 @@ markdown generation tasks to customer service agents with tool calling
|
|
|
109
109
|
capabilities.
|
|
110
110
|
|
|
111
111
|
<p align="center">
|
|
112
|
-
<img src="
|
|
112
|
+
<img src="https://raw.githubusercontent.com/eval-protocol/python-sdk/refs/heads/main/assets/ui.png" alt="UI" />
|
|
113
113
|
<br>
|
|
114
114
|
<sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
|
|
115
115
|
</p>
|
|
@@ -13,7 +13,7 @@ markdown generation tasks to customer service agents with tool calling
|
|
|
13
13
|
capabilities.
|
|
14
14
|
|
|
15
15
|
<p align="center">
|
|
16
|
-
<img src="
|
|
16
|
+
<img src="https://raw.githubusercontent.com/eval-protocol/python-sdk/refs/heads/main/assets/ui.png" alt="UI" />
|
|
17
17
|
<br>
|
|
18
18
|
<sub><b>Log Viewer: Monitor your evaluation rollouts in real time.</b></sub>
|
|
19
19
|
</p>
|
|
@@ -10,15 +10,16 @@ tool-augmented models using self-contained task bundles.
|
|
|
10
10
|
|
|
11
11
|
import warnings
|
|
12
12
|
|
|
13
|
-
from .adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
|
|
13
|
+
from eval_protocol.adapters.braintrust import reward_fn_to_scorer, scorer_to_reward_fn
|
|
14
|
+
|
|
14
15
|
from .auth import get_fireworks_account_id, get_fireworks_api_key
|
|
15
16
|
from .common_utils import load_jsonl
|
|
16
17
|
from .config import RewardKitConfig, get_config, load_config
|
|
17
18
|
from .mcp_env import (
|
|
18
19
|
AnthropicPolicy,
|
|
19
|
-
OpenAIPolicy,
|
|
20
|
-
LiteLLMPolicy,
|
|
21
20
|
FireworksPolicy,
|
|
21
|
+
LiteLLMPolicy,
|
|
22
|
+
OpenAIPolicy,
|
|
22
23
|
make,
|
|
23
24
|
rollout,
|
|
24
25
|
test_mcp,
|
|
@@ -8,11 +8,11 @@ import json
|
|
|
8
8
|
|
|
9
9
|
version_json = '''
|
|
10
10
|
{
|
|
11
|
-
"date": "2025-08-
|
|
11
|
+
"date": "2025-08-08T10:51:54-0700",
|
|
12
12
|
"dirty": false,
|
|
13
13
|
"error": null,
|
|
14
|
-
"full-revisionid": "
|
|
15
|
-
"version": "0.2.
|
|
14
|
+
"full-revisionid": "986452fd04442f9a0d1ba902753a83480e413d43",
|
|
15
|
+
"version": "0.2.6-dev1"
|
|
16
16
|
}
|
|
17
17
|
''' # END VERSION_JSON
|
|
18
18
|
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL file and return the parsed value of every non-blank line.

    Blank (whitespace-only) lines are skipped. Parsing stops at the first
    malformed line: errors are raised, not swallowed.

    Args:
        file_path: Path to the JSONL file (read as UTF-8).

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        ValueError: If a line is not valid JSON and contains ``row_id``; the
            message includes the extracted row id (or ``None`` if it could
            not be extracted) to help locate the offending record.
        json.JSONDecodeError: If a line is not valid JSON and does not
            mention ``row_id``. This is a subclass of ``ValueError``.
    """
    data: List[Dict[str, Any]] = []
    with open(file_path, "r", encoding="utf-8") as f:
        # line_number is 0-based (first line of the file is line 0).
        for line_number, line in enumerate(f):
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. trailing newline artifacts).
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON line for file {file_path} at line {line_number}")
                if "row_id" in line:
                    # Best-effort extraction of the row id from the broken line.
                    # Accepts both `"row_id": "x"` and the compact `"row_id":"x"`
                    # form, and captures either a quoted string or a bare token.
                    match = re.search(r'"row_id"\s*:\s*("(?:[^"\\]|\\.)*"|[^,}\s]+)', line)
                    row_id = match.group(1) if match else None
                    raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
                raise
    return data
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
# Shared constants for directory discovery
|
|
5
|
+
EVAL_PROTOCOL_DIR = ".eval_protocol"
|
|
6
|
+
PYTHON_FILES = ["pyproject.toml", "requirements.txt"]
|
|
7
|
+
DATASETS_DIR = "datasets"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def find_eval_protocol_dir() -> str:
    """
    Locate (and create if necessary) the ``.eval_protocol`` directory.

    The search starts at the directory containing this module and walks up
    towards the filesystem root. Resolution order:

    1. The nearest ancestor that already contains ``.eval_protocol``.
    2. Otherwise, the nearest ancestor containing one of ``PYTHON_FILES``
       (``pyproject.toml`` / ``requirements.txt``); ``.eval_protocol`` is
       placed inside it.
    3. Otherwise, ``.eval_protocol`` inside the current working directory.

    The filesystem root itself is never considered as a candidate.

    Returns:
        Path to the ``.eval_protocol`` directory, which exists on return.
    """
    start_dir = os.path.dirname(os.path.abspath(__file__))

    def _ancestors(path):
        # Yield `path` and each parent, stopping before the filesystem root.
        # The root is detected as the fixed point of dirname(), which works for
        # "/" on POSIX and for drive roots such as "C:\\" on Windows (comparing
        # against the literal "/" would loop forever there).
        while True:
            parent = os.path.dirname(path)
            if parent == path:
                return
            yield path
            path = parent

    log_dir = None

    # Pass 1: an existing .eval_protocol directory somewhere up the tree.
    for candidate in _ancestors(start_dir):
        if os.path.exists(os.path.join(candidate, EVAL_PROTOCOL_DIR)):
            log_dir = os.path.join(candidate, EVAL_PROTOCOL_DIR)
            break

    # Pass 2: fall back to the nearest directory holding a Python project file.
    if log_dir is None:
        for candidate in _ancestors(start_dir):
            if any(os.path.exists(os.path.join(candidate, f)) for f in PYTHON_FILES):
                log_dir = os.path.join(candidate, EVAL_PROTOCOL_DIR)
                break

    # Pass 3: last resort is the directory this Python process is running in.
    if log_dir is None:
        log_dir = os.path.join(os.getcwd(), EVAL_PROTOCOL_DIR)

    # create the .eval_protocol directory if it doesn't exist
    os.makedirs(log_dir, exist_ok=True)

    return log_dir
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def find_eval_protocol_datasets_dir() -> str:
    """
    Return the ``datasets`` directory inside ``.eval_protocol``.

    The parent directory is resolved through ``find_eval_protocol_dir()``;
    the ``datasets`` subdirectory is created when it does not exist yet.

    Returns:
        Path to the .eval_protocol/datasets directory
    """
    datasets_path = os.path.join(find_eval_protocol_dir(), DATASETS_DIR)

    # make sure the subdirectory is present before handing the path out
    os.makedirs(datasets_path, exist_ok=True)
    return datasets_path
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import time
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING, List, Optional
|
|
7
|
+
|
|
8
|
+
from eval_protocol.common_utils import load_jsonl
|
|
9
|
+
from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
|
|
10
|
+
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_datasets_dir
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from eval_protocol.models import EvaluationRow
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LocalFSDatasetLoggerAdapter(DatasetLogger):
    """
    Logger that stores evaluation rows as JSONL files on the local filesystem.

    New rows are appended to ``<datasets_dir>/<UTC date>.jsonl``; a row whose
    ID already exists in any ``.jsonl`` file of the datasets directory is
    rewritten in place in that file.

    NOTE(review): no file locking is performed in this class, so concurrent
    writers are not coordinated here — confirm whether callers serialize access.
    """

    def __init__(self):
        # Resolve the datasets directory once and derive its parent from it.
        self.datasets_dir = find_eval_protocol_datasets_dir()
        self.log_dir = os.path.dirname(self.datasets_dir)

        # ensure that log file exists
        if not os.path.exists(self.current_jsonl_path):
            with open(self.current_jsonl_path, "w", encoding="utf-8") as f:
                f.write("")

    @property
    def current_date(self) -> str:
        """Today's date as ``YYYY-MM-DD``."""
        # Use UTC timezone to be consistent across local device/locations/CI
        return datetime.now(timezone.utc).strftime("%Y-%m-%d")

    @property
    def current_jsonl_path(self) -> str:
        """
        The current JSONL file path. Based on the current date.
        """
        return os.path.join(self.datasets_dir, f"{self.current_date}.jsonl")

    def log(self, row: "EvaluationRow") -> None:
        """
        Log a row, updating an existing row with the same ID or appending a new one.

        Args:
            row: The row to persist; its ``input_metadata.row_id`` is the key.
        """
        row_id = row.input_metadata.row_id
        # Always terminate with "\n": files are opened in text mode, which already
        # translates newlines; writing os.linesep would yield "\r\r\n" on Windows.
        serialized = row.model_dump_json(exclude_none=True) + "\n"

        # Check if row with this ID already exists in any JSONL file
        if os.path.exists(self.datasets_dir):
            for filename in os.listdir(self.datasets_dir):
                if not filename.endswith(".jsonl"):
                    continue
                file_path = os.path.join(self.datasets_dir, filename)
                if not os.path.exists(file_path):
                    continue

                # Read/write as UTF-8 so the files agree with load_jsonl().
                with open(file_path, "r", encoding="utf-8") as f:
                    lines = f.readlines()

                # Find the line with matching ID
                for i, line in enumerate(lines):
                    try:
                        line_data = json.loads(line.strip())
                        existing_id = line_data["input_metadata"]["row_id"]
                    except (json.JSONDecodeError, KeyError, TypeError):
                        # Unparseable line or one without a row id: not a match.
                        continue
                    if existing_id == row_id:
                        # Update existing row
                        lines[i] = serialized
                        with open(file_path, "w", encoding="utf-8") as f:
                            f.writelines(lines)
                        return

        # If no existing row found, append new row to current file
        with open(self.current_jsonl_path, "a", encoding="utf-8") as f:
            f.write(serialized)

    def read(self, row_id: Optional[str] = None) -> List["EvaluationRow"]:
        """
        Read rows from all JSONL files in the datasets directory.

        Args:
            row_id: When truthy, only rows with this ID are returned.

        Returns:
            The parsed rows; an empty list if the datasets directory is missing.

        Raises:
            ValueError: If the same row ID appears more than once across the
                files. Errors raised by ``load_jsonl`` propagate unchanged.
        """
        # Imported lazily; at module level it is only imported under TYPE_CHECKING.
        from eval_protocol.models import EvaluationRow

        if not os.path.exists(self.datasets_dir):
            return []

        all_rows = []
        existing_row_ids = set()
        for filename in os.listdir(self.datasets_dir):
            if not filename.endswith(".jsonl"):
                continue
            file_path = os.path.join(self.datasets_dir, filename)
            for r in load_jsonl(file_path):
                row = EvaluationRow(**r)
                if row.input_metadata.row_id in existing_row_ids:
                    raise ValueError(f"Duplicate Row ID {row.input_metadata.row_id} already exists")
                existing_row_ids.add(row.input_metadata.row_id)
                all_rows.append(row)

        if row_id:
            # Filter by row_id if specified
            return [row for row in all_rows if getattr(row.input_metadata, "row_id", None) == row_id]
        return all_rows
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Cache for PEP 440 version string
|
|
2
|
+
import subprocess
|
|
3
|
+
|
|
4
|
+
# Last computed result. "base_version" holds the value the caller REQUESTED
# (possibly None), so a repeated call with the same argument hits the cache.
_version_cache = {"version": None, "base_version": None}


def _git_output(*args):
    """Run ``git <args>`` and return its stripped stdout (stderr discarded).

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status and
    ``FileNotFoundError`` when the ``git`` executable is not available.
    """
    return subprocess.check_output(
        ["git", *args], universal_newlines=True, stderr=subprocess.DEVNULL
    ).strip()


def get_pep440_version(base_version=None):
    """
    Generate a PEP 440 compliant version string based on git information.

    This function is inspired by versioneer but doesn't require the full versioneer
    setup, making it easier for downstream users to adopt without additional dependencies.

    The most recent result is cached in ``_version_cache`` to avoid repeated
    git calls when the function is called again with the same argument.

    Args:
        base_version: The base version string (e.g., "1.0.0"). If None, will try to
                     find the most recent version tag in git; if there is no
                     tag (or git is unavailable) "0.0.0" is used.

    Returns:
        A PEP 440 compliant version string that includes:
        - Development release number (devN) based on commit count since base_version
          (the total commit count is used when base_version is not a known
          revision or when no commits follow it)
        - Local version identifier with git commit hash
        - Dirty indicator if there are uncommitted changes

    Examples:
        >>> get_pep440_version("1.0.0")
        "1.0.0.dev42+g1234567"  # 42 commits since 1.0.0, commit hash 1234567
        >>> get_pep440_version("1.0.0")  # with uncommitted changes
        "1.0.0.dev42+g1234567.dirty"  # indicates dirty working directory
        >>> get_pep440_version("1.0.0")  # no git available
        "1.0.0+unknown"  # indicates git info not available
    """
    # Remember what the caller asked for: base_version is rebound below, and the
    # cache must be keyed on the request, not on the resolved value.
    requested_base_version = base_version

    # Check if we have a cached version for this base_version
    if _version_cache["version"] is not None and _version_cache["base_version"] == requested_base_version:
        return _version_cache["version"]

    try:
        # Check if we're in a git repository
        subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )

        # If base_version is None, try to find the most recent version tag
        if base_version is None:
            try:
                base_version = _git_output("describe", "--tags", "--abbrev=0")
            except subprocess.CalledProcessError:
                # No tags found, we'll handle this case specially
                base_version = None

        # Get commit count since base_version
        if base_version is None:
            # No base version (no tags), just count all commits
            count = _git_output("rev-list", "--count", "HEAD")
            base_version = "0.0.0"  # Use this for the final version string
        else:
            try:
                count = _git_output("rev-list", "--count", f"{base_version}..HEAD")
                # If no commits found, try counting from the beginning
                if count == "0" or not count:
                    count = _git_output("rev-list", "--count", "HEAD")
            except subprocess.CalledProcessError:
                # If base_version tag doesn't exist, count all commits
                count = _git_output("rev-list", "--count", "HEAD")

        # Get short commit hash
        commit_hash = _git_output("rev-parse", "--short", "HEAD")

        # Check for uncommitted changes (dirty working directory):
        # `git diff-index --quiet` exits non-zero when there are differences.
        try:
            subprocess.run(
                ["git", "diff-index", "--quiet", "HEAD", "--"],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            dirty_suffix = ""
        except subprocess.CalledProcessError:
            dirty_suffix = ".dirty"

        # Ensure count is a valid integer
        try:
            dev_count = int(count)
        except (ValueError, TypeError):
            dev_count = 0

        # Build PEP 440 compliant version string
        # Format: <base_version>.dev<count>+g<hash>[.dirty]
        version_parts = [base_version]

        if dev_count > 0:
            version_parts.append(f".dev{dev_count}")

        version_parts.append(f"+g{commit_hash}")

        if dirty_suffix:
            version_parts.append(dirty_suffix)

        result = "".join(version_parts)

    except (subprocess.CalledProcessError, FileNotFoundError, OSError):
        # Git is not available or not a git repository.
        if base_version is None:
            # Avoid emitting the invalid version "None+unknown".
            base_version = "0.0.0"
        result = f"{base_version}+unknown"

    # Cache the result
    _version_cache["version"] = result
    _version_cache["base_version"] = requested_base_version

    return result
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import random
|
|
2
1
|
import itertools
|
|
2
|
+
import random
|
|
3
3
|
from typing import Hashable
|
|
4
|
+
|
|
4
5
|
from . import dictionary
|
|
5
6
|
|
|
6
7
|
__all__ = ["generate_id"]
|
|
@@ -8,7 +9,7 @@ __all__ = ["generate_id"]
|
|
|
8
9
|
system_random = random.SystemRandom()
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=
|
|
12
|
+
def generate_id(separator="-", seed: int | float | str | bytes | bytearray | None = None, word_count=5) -> str:
|
|
12
13
|
"""
|
|
13
14
|
Generate a human readable ID
|
|
14
15
|
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Logging utilities for the eval_protocol package.
|
|
4
|
+
|
|
5
|
+
This module provides centralized logging configuration and utilities
|
|
6
|
+
for consistent logging across the eval_protocol package.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from eval_protocol.dataset_logger.directory_utils import find_eval_protocol_dir
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def setup_logger(
    name: str,
    log_file: Optional[str] = None,
    level: int = logging.INFO,
    console_level: int = logging.INFO,
    file_level: int = logging.DEBUG,
) -> logging.Logger:
    """
    Configure and return the named logger with a stdout handler and, optionally,
    a file handler.

    A logger that already has handlers and an explicit level is returned
    untouched; otherwise its handler list is reset and rebuilt.

    Args:
        name: Logger name
        log_file: Optional log file name, created inside the ``logs``
            directory under the eval_protocol directory
        level: Overall logger level
        console_level: Console handler level
        file_level: File handler level

    Returns:
        Configured logger instance
    """
    # The logs directory is created on every call, even when no file is requested.
    log_directory = Path(find_eval_protocol_dir()) / "logs"
    log_directory.mkdir(exist_ok=True)

    configured = logging.getLogger(name)

    # Nothing to do when a previous call (or someone else) already set it up.
    if configured.handlers and configured.level != logging.NOTSET:
        return configured

    configured.setLevel(level)

    # Start from a clean handler list so nothing gets emitted twice.
    configured.handlers.clear()

    # Console output goes to stdout with a compact format.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(console_level)
    stream_handler.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
    configured.addHandler(stream_handler)

    # File output (only when a file name was given) carries timestamps and the logger name.
    if log_file:
        disk_handler = logging.FileHandler(log_directory / log_file)
        disk_handler.setLevel(file_level)
        disk_handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
        configured.addHandler(disk_handler)

    # Keep records away from ancestor loggers to avoid duplicate output.
    configured.propagate = False

    return configured
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_logger(name: str) -> logging.Logger:
    """
    Return the named logger, configuring it with defaults on first use.

    A logger that already has handlers is returned as-is. Otherwise it is set
    up through ``setup_logger``:

    - ``"eval_watcher"`` with ``--daemon`` in ``sys.argv``: file ``eval_watcher.log``,
      console handler raised to CRITICAL.
    - ``"eval_watcher"`` otherwise: console only.
    - any other name: console plus ``<name>.log``.

    Args:
        name: Logger name

    Returns:
        Logger instance
    """
    existing = logging.getLogger(name)

    # Already has handlers: leave it alone.
    if existing.handlers:
        return existing

    # Every logger except the watcher gets console + its own log file.
    if name != "eval_watcher":
        return setup_logger(name, f"{name}.log")

    # Watcher running as a daemon subprocess: log to file only.
    if "--daemon" in sys.argv:
        return setup_logger(name, f"{name}.log", console_level=logging.CRITICAL)

    # Watcher at top level: log to console only.
    return setup_logger(name, None)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def log_evaluation_event(
    event_type: str, evaluation_id: str, message: str, level: int = logging.INFO, **kwargs
) -> None:
    """
    Log an evaluation-specific event through the ``evaluation_events`` logger.

    The record text is ``EVENT: `` followed by the dict built from the
    arguments.

    Args:
        event_type: Type of event (e.g., 'start', 'complete', 'error')
        evaluation_id: Evaluation identifier
        message: Log message
        level: Numeric log level; any level is accepted, not only the five
            standard constants
        **kwargs: Additional context to include in log
    """
    logger = get_logger("evaluation_events")

    # Create structured log entry
    log_entry = {"event_type": event_type, "evaluation_id": evaluation_id, "message": message, **kwargs}

    # Logger.log dispatches on the numeric level, so custom levels are emitted
    # too instead of being silently dropped by an if/elif chain.
    logger.log(level, f"EVENT: {log_entry}")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def log_performance_metric(metric_name: str, value: float, unit: str = "", context: Optional[dict] = None) -> None:
    """
    Emit one performance metric at INFO level through the ``performance_metrics`` logger.

    Args:
        metric_name: Name of the metric
        value: Metric value
        unit: Unit of measurement
        context: Additional context information; a missing or empty value is
            recorded as ``{}``
    """
    metrics_logger = get_logger("performance_metrics")

    payload = {
        "metric": metric_name,
        "value": value,
        "unit": unit,
        "context": context or {},
    }

    metrics_logger.info(f"METRIC: {payload}")
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def log_error_with_context(error: Exception, context: str, additional_info: Optional[dict] = None) -> None:
    """
    Log an exception with additional context through the ``errors`` logger.

    The traceback attached to the record is that of ``error`` itself, so the
    function also works when called outside an ``except`` block.

    Args:
        error: The exception that occurred
        context: Context where the error occurred
        additional_info: Additional information about the error
    """
    logger = get_logger("errors")

    error_entry = {
        "error_type": type(error).__name__,
        "error_message": str(error),
        "context": context,
        "additional_info": additional_info or {},
    }

    # Pass the exception instance rather than True: exc_info=True reads
    # sys.exc_info(), which is empty unless an exception is being handled.
    logger.error(f"ERROR: {error_entry}", exc_info=error)
|
{eval_protocol-0.2.5.dev1 → eval_protocol-0.2.6.dev1}/eval_protocol/mcp/client/connection.py
RENAMED
|
@@ -16,6 +16,7 @@ from mcp.client.session import ClientSession
|
|
|
16
16
|
from mcp.client.streamable_http import streamablehttp_client
|
|
17
17
|
|
|
18
18
|
from ...types import MCPSession
|
|
19
|
+
from mcp.types import Implementation
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
@@ -50,19 +51,16 @@ class MCPConnectionManager:
|
|
|
50
51
|
|
|
51
52
|
exit_stack = AsyncExitStack()
|
|
52
53
|
|
|
53
|
-
client_info =
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
client_info._extra["dataset_row_id"] = session.dataset_row.id
|
|
64
|
-
if session.model_id:
|
|
65
|
-
client_info._extra["model_id"] = session.model_id
|
|
54
|
+
client_info = Implementation(name="reward-kit", version="1.0.0", _extra={})
|
|
55
|
+
client_info._extra["session_id"] = session.session_id
|
|
56
|
+
if session.seed is not None:
|
|
57
|
+
client_info._extra["seed"] = session.seed
|
|
58
|
+
if session.dataset_row and session.dataset_row.environment_context:
|
|
59
|
+
client_info._extra["config"] = session.dataset_row.environment_context
|
|
60
|
+
if session.dataset_row and session.dataset_row.id:
|
|
61
|
+
client_info._extra["dataset_row_id"] = session.dataset_row.id
|
|
62
|
+
if session.model_id:
|
|
63
|
+
client_info._extra["model_id"] = session.model_id
|
|
66
64
|
|
|
67
65
|
read_stream, write_stream, _ = await exit_stack.enter_async_context(
|
|
68
66
|
streamablehttp_client(session.base_url, terminate_on_close=True)
|
|
@@ -77,32 +75,6 @@ class MCPConnectionManager:
|
|
|
77
75
|
session._mcp_session = mcp_session
|
|
78
76
|
session._exit_stack = exit_stack
|
|
79
77
|
|
|
80
|
-
# Update session ID to match server's calculation (for control plane sync)
|
|
81
|
-
if client_info and hasattr(client_info, "_extra"):
|
|
82
|
-
extra_data = client_info._extra
|
|
83
|
-
if extra_data and isinstance(extra_data, dict):
|
|
84
|
-
|
|
85
|
-
seed_value = extra_data.get("seed")
|
|
86
|
-
config_value = extra_data.get("config", {})
|
|
87
|
-
dataset_row_id_value = extra_data.get("dataset_row_id")
|
|
88
|
-
model_id_value = extra_data.get("model_id")
|
|
89
|
-
|
|
90
|
-
stable_data = {
|
|
91
|
-
"seed": seed_value,
|
|
92
|
-
"config": config_value,
|
|
93
|
-
"dataset_row_id": dataset_row_id_value,
|
|
94
|
-
"model_id": model_id_value,
|
|
95
|
-
"name": client_info.name,
|
|
96
|
-
"version": client_info.version,
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
stable_str = json.dumps(stable_data, sort_keys=True)
|
|
100
|
-
server_session_id = hashlib.md5(stable_str.encode()).hexdigest()
|
|
101
|
-
|
|
102
|
-
# Update the session ID to match what the server generated
|
|
103
|
-
session.session_id = server_session_id
|
|
104
|
-
logger.info(f"Updated session ID to match server: {server_session_id}")
|
|
105
|
-
|
|
106
78
|
# PRE-WARM: Discover and cache tools immediately after session initialization
|
|
107
79
|
# This prevents concurrent list_tools() calls later
|
|
108
80
|
await self._prewarm_tools_cache(session)
|