eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DeepCoder-style reward function for evaluating code correctness based on test cases.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re # For function name extraction
|
|
8
|
+
from typing import Any, Dict, List, Optional, Union
|
|
9
|
+
|
|
10
|
+
from ..models import EvaluateResult, Message, MetricResult
|
|
11
|
+
from ..reward_function import reward_function
|
|
12
|
+
from .code_execution import _HAS_E2B # Import _HAS_E2B to check E2B availability
|
|
13
|
+
from .code_execution import _run_test_cases # Import the main test case runner
|
|
14
|
+
from .code_execution import (
|
|
15
|
+
compare_outputs,
|
|
16
|
+
execute_code_with_e2b,
|
|
17
|
+
execute_javascript_code,
|
|
18
|
+
execute_python_code,
|
|
19
|
+
extract_code_blocks,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@reward_function
def deepcoder_code_reward(
    messages: List[Message],  # Full conversation; the model's response is messages[-1]
    ground_truth: List[Dict[str, Any]],  # The test cases to run against the code
    language: str,
    timeout: int = 10,
    environment: str = "local",
    api_key: Optional[str] = None,
    target_function: Optional[str] = None,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Score a code response all-or-nothing against a list of test cases.

    The first code block of the requested language is pulled out of the last
    message and handed to the shared ``_run_test_cases`` runner. The final
    score is 1.0 only when that runner reports a score of exactly 1.0; any
    other outcome is collapsed to 0.0.

    Args:
        messages: Conversation messages. The last entry must be a ``Message``
            with role ``"assistant"`` and non-``None`` content.
        ground_truth: Test cases, forwarded unchanged to ``_run_test_cases``
            as ``test_cases``. An empty/falsy value yields an invalid result.
        language: Language used both to select the code block and to run it.
        timeout: Forwarded to the runner as ``timeout``.
        environment: Forwarded to the runner as ``environment``.
        api_key: Forwarded to the runner as ``api_key`` when not ``None``.
        target_function: Forwarded to the runner as ``function_to_call`` when
            not ``None``. Its presence or absence is also recorded as a metric.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        EvaluateResult whose score is 1.0 or 0.0. ``is_score_valid`` is True
        only for a score of 1.0. Metrics include the extracted code, the
        target-function note, whatever metrics the runner returned, and an
        ``overall_status`` entry.
    """

    def _rejected(
        reason: str,
        detail: str,
        carried: Optional[Dict[str, MetricResult]] = None,
    ) -> EvaluateResult:
        # Shared shape of every early exit: zero score, flagged invalid, with
        # an "error" metric listed first and any already-gathered metrics after.
        failure_metrics: Dict[str, MetricResult] = {
            "error": MetricResult(score=0.0, is_score_valid=False, reason=detail)
        }
        failure_metrics.update(carried or {})
        return EvaluateResult(
            score=0.0,
            reason=reason,
            metrics=failure_metrics,
            is_score_valid=False,
        )

    last_message = messages[-1] if messages else None
    if (
        not isinstance(last_message, Message)
        or last_message.role != "assistant"
        or last_message.content is None
    ):
        return _rejected(
            "Invalid or missing assistant response in messages.",
            "Last message not a valid assistant response.",
        )

    found_blocks = extract_code_blocks(last_message.content, language)
    if not found_blocks:
        missing_block = f"No {language} code block found."
        return _rejected(missing_block, missing_block)

    # Only the first matching block is evaluated.
    source_code = found_blocks[0]["code"]
    collected: Dict[str, MetricResult] = {
        "extracted_code": MetricResult(
            score=0.0,
            is_score_valid=True,
            reason=f"Extracted code:\n```\n{source_code}\n```",
        )
    }

    if not ground_truth:
        return _rejected("No test cases provided.", "No test cases provided.", collected)

    if target_function:
        collected["target_function_provided"] = MetricResult(
            score=0.0,
            is_score_valid=True,
            reason=f"Using provided target function: {target_function}",
        )
    else:
        collected["target_function_missing"] = MetricResult(
            score=0.0,
            is_score_valid=False,
            reason="Target function name not provided in input data. Will attempt stdin/stdout.",
        )

    # Arguments that are None are omitted so the runner's own defaults apply.
    candidate_args = (
        ("code", source_code),
        ("language", language),
        ("test_cases", ground_truth),
        ("timeout", timeout),
        ("environment", environment),
        ("api_key", api_key),
        ("function_to_call", target_function),
    )
    runner_args = {name: value for name, value in candidate_args if value is not None}

    test_outcome: EvaluateResult = _run_test_cases(**runner_args)  # type: ignore
    runner_reason = test_outcome.reason
    all_passed = test_outcome.score == 1.0
    binary_score = 1.0 if all_passed else 0.0

    if test_outcome.metrics:
        collected.update(test_outcome.metrics)

    if all_passed:
        summary = "All tests passed."
    else:
        summary = "One or more tests failed or an error occurred."
    collected["overall_status"] = MetricResult(
        score=binary_score, is_score_valid=all_passed, reason=summary
    )

    # On failure, prefer the runner's own explanation over the generic summary.
    reported_reason = runner_reason if (not all_passed and runner_reason) else summary

    return EvaluateResult(
        score=binary_score,
        reason=reported_reason,
        metrics=collected,
        is_score_valid=all_passed,
    )
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Reward functions for validating text format.
|
|
3
|
+
|
|
4
|
+
This module provides reward functions that validate if text responses
|
|
5
|
+
adhere to specific formatting requirements, such as containing specific tags
|
|
6
|
+
in the correct order.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
from typing import Any, Dict, List, Optional, Union # Added Optional
|
|
11
|
+
|
|
12
|
+
from ..models import EvaluateResult, Message, MetricResult
|
|
13
|
+
from ..typed_interface import reward_function
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@reward_function
def format_reward(
    messages: Union[List[Message], List[Dict[str, Any]]],
    ground_truth: Optional[Union[List[Message], List[Dict[str, Any]]]] = None,
    format_regex: str = r"^<think>\n.*?</think>\n<answer>\n.*?</answer>$",
    require_exact_match: bool = True,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Check whether the last message's text matches a format regular expression.

    The default pattern expects a ``<think>`` section followed by an
    ``<answer>`` section. The pattern is compiled with ``re.DOTALL`` so ``.``
    also spans newlines.

    Args:
        messages: Conversation messages; ``messages[-1]`` is inspected. It may
            be a ``Message`` or a dict with ``"role"`` and ``"content"`` keys.
        ground_truth: Accepted for interface compatibility; not used here.
        format_regex: Regular expression the response text is tested against.
        require_exact_match: If True the pattern is applied with
            ``Pattern.match`` (anchored at the start of the text only; any
            end-of-text anchoring must come from the pattern itself, as the
            default's ``$`` does). If False it is applied with
            ``Pattern.search`` and may match anywhere in the text.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        EvaluateResult with score 1.0 and ``is_score_valid=True`` when the
        pattern matches; otherwise score 0.0 and ``is_score_valid=False``.
        A single ``format_check`` metric mirrors the overall result.
    """

    def _verdict(passed: bool, reason: str, detail: str) -> EvaluateResult:
        # Every exit path shares one shape: the metric mirrors the top-level
        # score and validity flag, differing only in the explanatory strings.
        value = 1.0 if passed else 0.0
        return EvaluateResult(
            score=value,
            reason=reason,
            metrics={
                "format_check": MetricResult(
                    score=value,
                    is_score_valid=passed,
                    reason=detail,
                )
            },
            is_score_valid=passed,
        )

    if not messages:
        return _verdict(False, "No messages provided", "No messages provided")

    final_message = messages[-1]

    # Normalise the two supported message representations to (role, content).
    if isinstance(final_message, Message):
        role = final_message.role
        content = final_message.content
    elif isinstance(final_message, dict):
        role = final_message.get("role")
        content = final_message.get("content")
    else:
        return _verdict(
            False,
            "Last message is of unexpected type.",
            "Invalid message type in messages.",
        )

    if role != "assistant" or not content:
        return _verdict(
            False,
            "No assistant response found",
            "Message not from assistant or has no content",
        )

    compiled = re.compile(format_regex, re.DOTALL)
    apply_pattern = compiled.match if require_exact_match else compiled.search

    if apply_pattern(content):
        return _verdict(
            True,
            "Format is correct",
            "Text follows the required format pattern",
        )
    return _verdict(
        False,
        "Format is incorrect",
        "Text does not follow the required format pattern",
    )
|