eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,161 @@
1
+ """
2
+ DeepCoder-style reward function for evaluating code correctness based on test cases.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import re # For function name extraction
8
+ from typing import Any, Dict, List, Optional, Union
9
+
10
+ from ..models import EvaluateResult, Message, MetricResult
11
+ from ..reward_function import reward_function
12
+ from .code_execution import _HAS_E2B # Import _HAS_E2B to check E2B availability
13
+ from .code_execution import _run_test_cases # Import the main test case runner
14
+ from .code_execution import (
15
+ compare_outputs,
16
+ execute_code_with_e2b,
17
+ execute_javascript_code,
18
+ execute_python_code,
19
+ extract_code_blocks,
20
+ )
21
+
22
+
23
@reward_function
def deepcoder_code_reward(
    messages: List[Message],  # Full conversation, model's response is messages[-1]
    ground_truth: List[Dict[str, Any]],  # This is the test_cases
    language: str,
    timeout: int = 10,
    environment: str = "local",
    api_key: Optional[str] = None,
    target_function: Optional[str] = None,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Evaluates code based on a set of test cases, DeepCoder-style.

    The score is binary: 1.0 only when the shared ``_run_test_cases`` utility
    reports a score of exactly 1.0, and 0.0 in every other case (including
    partial scores and the early-exit error paths below).

    Args:
        messages: List of conversation messages. The last message must be a
            ``Message`` with role "assistant" and non-None content; the code
            is extracted from that content.
        ground_truth: A list of dictionaries, each representing a test case with
            "input" (string) and "expected_output" (string). This corresponds
            to the `test_cases` parameter in the previous signature.
        language: Programming language of the code (e.g., "python", "javascript").
            Also used to select which code blocks are extracted.
        timeout: Execution timeout per test case in seconds.
        environment: "local" or "e2b" for code execution.
        api_key: E2B API key, required if environment is "e2b".
        target_function: Optional name of the function to call within the code.
            When omitted, a "target_function_missing" metric is recorded and the
            runner is invoked without a function name.
        **kwargs: Additional arguments (not used by this function).

    Returns:
        EvaluateResult with a score of 1.0 or 0.0 and detailed metrics.
        ``is_score_valid`` is True only when the score is 1.0.
    """
    metrics_dict: Dict[str, MetricResult] = {}

    # Guard: the last message must be a usable assistant response.
    if (
        not messages
        or not isinstance(messages[-1], Message)
        or messages[-1].role != "assistant"
        or messages[-1].content is None
    ):
        return EvaluateResult(
            score=0.0,
            reason="Invalid or missing assistant response in messages.",
            metrics={
                "error": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason="Last message not a valid assistant response.",
                )
            },
            is_score_valid=False,
        )

    assistant_content = messages[-1].content
    test_cases = ground_truth

    code_blocks = extract_code_blocks(assistant_content, language)
    if not code_blocks:
        no_code_reason = f"No {language} code block found."
        return EvaluateResult(
            score=0.0,
            reason=no_code_reason,
            metrics={
                "error": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason=no_code_reason,
                )
            },
            is_score_valid=False,
        )

    # Only the first extracted block is executed.
    code_to_execute = code_blocks[0]["code"]
    metrics_dict["extracted_code"] = MetricResult(
        score=0.0,
        is_score_valid=True,
        reason=f"Extracted code:\n```\n{code_to_execute}\n```",
    )

    if not test_cases:
        return EvaluateResult(
            score=0.0,
            reason="No test cases provided.",
            metrics={
                "error": MetricResult(score=0.0, is_score_valid=False, reason="No test cases provided."),
                **metrics_dict,  # Include already gathered metrics like extracted_code
            },
            is_score_valid=False,
        )

    function_to_call = target_function
    if function_to_call:
        metrics_dict["target_function_provided"] = MetricResult(
            score=0.0,
            is_score_valid=True,
            reason=f"Using provided target function: {function_to_call}",
        )
    else:
        metrics_dict["target_function_missing"] = MetricResult(
            score=0.0,
            is_score_valid=False,
            reason="Target function name not provided in input data. Will attempt stdin/stdout.",
        )

    run_test_cases_kwargs = {
        "code": code_to_execute,
        "language": language,
        "test_cases": test_cases,
        "timeout": timeout,
        "environment": environment,
        "api_key": api_key,
        "function_to_call": function_to_call,
    }

    # Drop None-valued settings so they are simply not passed to the runner.
    filtered_kwargs = {k: v for k, v in run_test_cases_kwargs.items() if v is not None}

    eval_result_from_tests: EvaluateResult = _run_test_cases(**filtered_kwargs)  # type: ignore

    # Collapse the runner's score to pass/fail: anything short of 1.0 is a failure.
    all_passed = eval_result_from_tests.score == 1.0
    final_score = 1.0 if all_passed else 0.0

    # Runner metrics are merged after our own, so they win on key collisions;
    # "overall_status" is written last and always reflects the binary outcome.
    if eval_result_from_tests.metrics:
        metrics_dict.update(eval_result_from_tests.metrics)

    overall_reason = "All tests passed." if all_passed else "One or more tests failed or an error occurred."
    metrics_dict["overall_status"] = MetricResult(
        score=final_score, is_score_valid=all_passed, reason=overall_reason
    )

    # On failure, prefer the runner's own explanation when it supplied one.
    final_reason = overall_reason
    if not all_passed and eval_result_from_tests.reason:
        final_reason = eval_result_from_tests.reason

    return EvaluateResult(
        score=final_score,
        reason=final_reason,
        metrics=metrics_dict,
        is_score_valid=all_passed,
    )
@@ -0,0 +1,129 @@
1
+ """
2
+ Reward functions for validating text format.
3
+
4
+ This module provides reward functions that validate if text responses
5
+ adhere to specific formatting requirements, such as containing specific tags
6
+ in the correct order.
7
+ """
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Union # Added Optional
11
+
12
+ from ..models import EvaluateResult, Message, MetricResult
13
+ from ..typed_interface import reward_function
14
+
15
+
16
+ @reward_function
17
+ def format_reward(
18
+ messages: Union[List[Message], List[Dict[str, Any]]],
19
+ ground_truth: Optional[Union[List[Message], List[Dict[str, Any]]]] = None,
20
+ format_regex: str = r"^<think>\n.*?</think>\n<answer>\n.*?</answer>$",
21
+ require_exact_match: bool = True,
22
+ **kwargs: Any,
23
+ ) -> EvaluateResult:
24
+ """
25
+ Reward function that validates if text follows a specific format pattern.
26
+ The model's response is assumed to be the last message in the `messages` list.
27
+
28
+ By default, this checks for <think> and <answer> tags in the correct order,
29
+ ensuring proper separation of reasoning and final answer.
30
+
31
+ Args:
32
+ messages: List of conversation messages, where `messages[-1]` is the model's response.
33
+ ground_truth: Optional. Expected assistant response trajectory. Not directly used by this format reward.
34
+ format_regex: Regular expression pattern to match. Default checks for
35
+ <think>...</think> followed by <answer>...</answer>.
36
+ require_exact_match: If True, the entire text must match the pattern.
37
+ If False, pattern just needs to be found in text.
38
+ **kwargs: Additional arguments.
39
+
40
+ Returns:
41
+ EvaluateResult with score 1.0 if format is correct, 0.0 otherwise
42
+ """
43
+ if not messages or len(messages) == 0:
44
+ return EvaluateResult(
45
+ score=0.0,
46
+ reason="No messages provided",
47
+ metrics={"format_check": MetricResult(score=0.0, is_score_valid=False, reason="No messages provided")},
48
+ is_score_valid=False,
49
+ )
50
+
51
+ response = messages[-1]
52
+
53
+ if isinstance(response, Message):
54
+ if response.role != "assistant" or not response.content:
55
+ return EvaluateResult(
56
+ score=0.0,
57
+ reason="No assistant response found",
58
+ metrics={
59
+ "format_check": MetricResult(
60
+ score=0.0,
61
+ is_score_valid=False,
62
+ reason="Message not from assistant or has no content",
63
+ )
64
+ },
65
+ is_score_valid=False,
66
+ )
67
+ text = response.content
68
+ elif isinstance(response, dict):
69
+ if response.get("role") != "assistant" or not response.get("content"):
70
+ return EvaluateResult(
71
+ score=0.0,
72
+ reason="No assistant response found",
73
+ metrics={
74
+ "format_check": MetricResult(
75
+ score=0.0,
76
+ is_score_valid=False,
77
+ reason="Message not from assistant or has no content",
78
+ )
79
+ },
80
+ is_score_valid=False,
81
+ )
82
+ text = response.get("content", "")
83
+ else:
84
+ return EvaluateResult(
85
+ score=0.0,
86
+ reason="Last message is of unexpected type.",
87
+ metrics={
88
+ "format_check": MetricResult(
89
+ score=0.0,
90
+ is_score_valid=False,
91
+ reason="Invalid message type in messages.",
92
+ )
93
+ },
94
+ is_score_valid=False,
95
+ )
96
+
97
+ pattern = re.compile(format_regex, re.DOTALL)
98
+
99
+ if require_exact_match:
100
+ match = pattern.match(text)
101
+ else:
102
+ match = pattern.search(text)
103
+
104
+ if match:
105
+ return EvaluateResult(
106
+ score=1.0,
107
+ reason="Format is correct",
108
+ metrics={
109
+ "format_check": MetricResult(
110
+ score=1.0,
111
+ is_score_valid=True,
112
+ reason="Text follows the required format pattern",
113
+ )
114
+ },
115
+ is_score_valid=True,
116
+ )
117
+ else:
118
+ return EvaluateResult(
119
+ score=0.0,
120
+ reason="Format is incorrect",
121
+ metrics={
122
+ "format_check": MetricResult(
123
+ score=0.0,
124
+ is_score_valid=False,
125
+ reason="Text does not follow the required format pattern",
126
+ )
127
+ },
128
+ is_score_valid=False,
129
+ )