eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import inspect
|
|
3
|
+
import json
|
|
4
|
+
import logging # Import logging
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
6
|
+
|
|
7
|
+
from eval_protocol.agent.resources.bfcl_sim_api_resource import BFCLSimAPIResource
|
|
8
|
+
from eval_protocol.models import EvaluateResult, Message, MetricResult
|
|
9
|
+
from eval_protocol.typed_interface import reward_function
|
|
10
|
+
|
|
11
|
+
# Module-level logger shared by every function in this file. It is registered
# under the fixed name "bfcl_reward" (not the module's dotted path).
logger = logging.getLogger("bfcl_reward")
# The level is forced to DEBUG at import time so that the debug records emitted
# below are not filtered out by this logger's own level.
logger.setLevel(logging.DEBUG)  # Ensure debug logs are processed
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Helper functions adapted from BfclRubric (copied here)
|
|
17
|
+
def _parse_function_call(func_call_str: str):
|
|
18
|
+
"""
|
|
19
|
+
Parses a function call string into a JSON-like dictionary.
|
|
20
|
+
|
|
21
|
+
:param func_call_str: String representation of a function call.
|
|
22
|
+
:return: JSON-like dictionary with function name and arguments.
|
|
23
|
+
"""
|
|
24
|
+
try:
|
|
25
|
+
tree = ast.parse(func_call_str, mode="eval")
|
|
26
|
+
if not isinstance(tree.body, ast.Call):
|
|
27
|
+
raise ValueError("Input is not a valid function call.")
|
|
28
|
+
func_name = tree.body.func.id if isinstance(tree.body.func, ast.Name) else None
|
|
29
|
+
if not func_name:
|
|
30
|
+
raise ValueError("Could not determine function name.")
|
|
31
|
+
args_dict = {}
|
|
32
|
+
for kw in tree.body.keywords:
|
|
33
|
+
args_dict[kw.arg] = ast.literal_eval(kw.value)
|
|
34
|
+
for i, arg in enumerate(tree.body.args):
|
|
35
|
+
args_dict[f"pos_arg_{i}"] = ast.literal_eval(arg) # Standardized positional arg key
|
|
36
|
+
|
|
37
|
+
json_obj = {"name": func_name, "args": args_dict}
|
|
38
|
+
return json_obj
|
|
39
|
+
except Exception:
|
|
40
|
+
raise ValueError(f"Error parsing function call string: {func_call_str}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _are_function_calls_equivalent(call1: Dict[str, Any], call2: Dict[str, Any]) -> bool:
|
|
44
|
+
"""
|
|
45
|
+
Compares two parsed function call dictionaries for semantic equivalence.
|
|
46
|
+
Special handling for 'sort' command arguments.
|
|
47
|
+
"""
|
|
48
|
+
if not isinstance(call1, dict) or not isinstance(call2, dict):
|
|
49
|
+
logger.warning(f"Invalid input to _are_function_calls_equivalent: call1={call1}, call2={call2}")
|
|
50
|
+
return False
|
|
51
|
+
|
|
52
|
+
name1 = call1.get("name")
|
|
53
|
+
name2 = call2.get("name")
|
|
54
|
+
|
|
55
|
+
if name1 != name2:
|
|
56
|
+
return False
|
|
57
|
+
|
|
58
|
+
args1 = call1.get("args", {})
|
|
59
|
+
args2 = call2.get("args", {})
|
|
60
|
+
|
|
61
|
+
if not isinstance(args1, dict) or not isinstance(args2, dict):
|
|
62
|
+
logger.warning(f"Invalid args in _are_function_calls_equivalent: args1={args1}, args2={args2}")
|
|
63
|
+
return False # Should be dicts
|
|
64
|
+
|
|
65
|
+
if name1 == "sort":
|
|
66
|
+
val1 = args1.get("pos_arg_0", args1.get("file_name"))
|
|
67
|
+
val2 = args2.get("pos_arg_0", args2.get("file_name"))
|
|
68
|
+
|
|
69
|
+
# Check if both extracted values are not None and are equal
|
|
70
|
+
# And that both arg dicts have only one relevant key for the sort value
|
|
71
|
+
# (to avoid matching if one has extra unrecognized args beyond the file name)
|
|
72
|
+
|
|
73
|
+
# Condition 1: Both args have exactly one key
|
|
74
|
+
cond1_holds = len(args1) == 1 and len(args2) == 1
|
|
75
|
+
# Condition 2: The values associated with the (potentially different) keys are the same
|
|
76
|
+
cond2_holds = val1 is not None and val1 == val2
|
|
77
|
+
|
|
78
|
+
if cond1_holds and cond2_holds:
|
|
79
|
+
return True
|
|
80
|
+
# Fallback to direct equality if the specific sort logic doesn't cleanly apply
|
|
81
|
+
# (e.g. if one has pos_arg_0 and the other has file_name AND other args, or different number of args)
|
|
82
|
+
return args1 == args2
|
|
83
|
+
else:
|
|
84
|
+
# For all other functions, use direct argument dictionary equality
|
|
85
|
+
return args1 == args2
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _is_subsequence_unordered(list1: List[Dict[str, Any]], list2: List[Dict[str, Any]]) -> tuple[bool, list]:
|
|
89
|
+
"""
|
|
90
|
+
Checks if all elements of list1 are present in list2, using _are_function_calls_equivalent for comparison.
|
|
91
|
+
Also returns the elements of list1 that are not present in list2.
|
|
92
|
+
"""
|
|
93
|
+
if not list1: # If list1 is empty, it's always a subsequence
|
|
94
|
+
return True, []
|
|
95
|
+
# If list1 is not empty but list2 is, list1 cannot be a subsequence.
|
|
96
|
+
# This also handles the case where list2 becomes empty during processing.
|
|
97
|
+
if not list2 and list1:
|
|
98
|
+
return False, list1[:]
|
|
99
|
+
|
|
100
|
+
list2_copy = list2[:] # Make a copy to modify
|
|
101
|
+
missing_elements = []
|
|
102
|
+
|
|
103
|
+
for item1 in list1:
|
|
104
|
+
found_match_in_list2 = False
|
|
105
|
+
for i, item2 in enumerate(list2_copy):
|
|
106
|
+
if _are_function_calls_equivalent(item1, item2):
|
|
107
|
+
list2_copy.pop(i) # Remove the matched element from list2_copy
|
|
108
|
+
found_match_in_list2 = True
|
|
109
|
+
break # Move to the next item in list1
|
|
110
|
+
if not found_match_in_list2:
|
|
111
|
+
missing_elements.append(item1)
|
|
112
|
+
|
|
113
|
+
is_subsequence = len(missing_elements) == 0
|
|
114
|
+
return is_subsequence, missing_elements
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def compare_comparable_states(model_state: Dict[str, Any], gt_state: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
    """
    Compares two comparable state dictionaries.

    Both arguments map a class name to that instance's state. The top-level
    key sets must be identical; after that each instance state is compared
    with ``!=``.

    :param model_state: State produced by the model's run, keyed by class name.
    :param gt_state: Expected (ground truth) state, keyed by class name.
    :return: ``(all_match, differences)``. ``differences`` is empty on a match.
        Otherwise it is one of:

        * ``{"keys_mismatch": {"model_keys": [...], "gt_keys": [...]}}`` when
          the top-level key sets differ;
        * ``{class_name: {attr: (model_value, gt_value), ...}}`` for each
          mismatching class whose states are dict-like. An attribute absent
          on one side is reported with ``None`` on that side;
        * ``{class_name: (model_instance_state, gt_instance_state)}`` for a
          mismatching class whose states cannot be diffed attribute by
          attribute (for example lists or scalars).
    """
    if set(model_state.keys()) != set(gt_state.keys()):
        return False, {
            "keys_mismatch": {
                "model_keys": list(model_state.keys()),
                "gt_keys": list(gt_state.keys()),
            }
        }

    # Sentinel used only for the comparison, so that "attribute missing" is
    # distinguishable from "attribute present with value None". Without it a
    # mismatch of that kind would be reported with an empty diff.
    absent = object()

    all_match = True
    differences: Dict[str, Any] = {}
    for class_name, model_instance_state in model_state.items():
        gt_instance_state = gt_state[class_name]
        if model_instance_state != gt_instance_state:
            all_match = False
            try:
                diffs: Any = {
                    k: (model_instance_state.get(k), gt_instance_state.get(k))
                    for k in set(model_instance_state) | set(gt_instance_state)
                    if model_instance_state.get(k, absent) != gt_instance_state.get(k, absent)
                }
            except (AttributeError, TypeError):
                # States that are not dict-like cannot be diffed per attribute;
                # report the two values whole instead of crashing.
                diffs = (model_instance_state, gt_instance_state)
            differences[class_name] = diffs

    return all_match, differences
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _safe_json_dumps(value: Any) -> str:
    """
    Render ``value`` as JSON for log and reason text without raising.

    Falls back to ``repr(value)`` when the value is not JSON-serializable, so
    that building a diagnostic message can never change the computed score.
    """
    try:
        return json.dumps(value)
    except (TypeError, ValueError):
        return repr(value)


@reward_function
def bfcl_reward(
    messages: List[Message],  # Full conversation, assistant responses are at the end
    ground_truth: Dict[str, Any],  # Contains 'function_calls' and 'comparable_state'
    state: Dict[str, Any],  # Runtime state (BFCLSimAPIResource, successful_func_calls)
    **kwargs: Any,
) -> EvaluateResult:
    """
    Evaluates agent performance on BFCL tasks based on state, function calls, and format.

    Three components are computed:

    * state match (0.5): the resource's comparable state equals
      ``ground_truth["comparable_state"]``;
    * function call match (0.5): for every ground-truth turn, all expected
      calls appear (in any order) among the model's successful calls for that
      turn, and the model has exactly as many turns as the ground truth. With
      no ground-truth turns, the model must have made no turns with calls;
    * format (0.2): at least one assistant message has non-blank content or
      tool calls.

    The final score is all-or-nothing. It is ``1.0 + format score`` when both
    the state and function call components are perfect, and ``0.0`` otherwise.

    :param messages: Conversation messages. Only ``Message`` instances with
        role ``"assistant"`` take part in the format check.
    :param ground_truth: Must contain ``"function_calls"`` (one list of call
        strings per turn) and ``"comparable_state"``.
    :param state: Must contain ``"resource"`` (a ``BFCLSimAPIResource``). May
        contain ``"successful_func_calls"`` (one list of parsed calls per turn).
    :param kwargs: Accepted for interface compatibility; not used here.
    :return: ``EvaluateResult`` with the final score, a textual reason and the
        ``state_match``, ``function_call_match`` and ``format_check`` metrics.
        A score of 0.0 with empty metrics is returned when the ground truth
        entries or the resource are missing.
    """
    ground_truth_function_calls: Optional[List[List[str]]] = ground_truth.get("function_calls")
    ground_truth_comparable_state: Optional[Dict[str, Any]] = ground_truth.get("comparable_state")

    # Log ground truth data received
    logger.debug(f"Ground truth function calls from input: {ground_truth_function_calls}")
    logger.debug(f"Ground truth comparable state from input: {ground_truth_comparable_state}")

    if ground_truth_function_calls is None or ground_truth_comparable_state is None:
        return EvaluateResult(
            score=0.0,
            reason="Ground truth 'function_calls' or 'comparable_state' not found in ground_truth dict.",
            metrics={},
        )

    # Access the BFCLSimAPIResource instance from the state
    bfcl_resource: Optional[BFCLSimAPIResource] = state.get("resource")

    if not isinstance(bfcl_resource, BFCLSimAPIResource):
        return EvaluateResult(
            score=0.0,
            reason="BFCLSimAPIResource instance not found in state.",
            metrics={},
        )

    # --- State Matches Check ---
    model_comparable_state = bfcl_resource.get_comparable_state()
    state_match, state_diffs = compare_comparable_states(model_comparable_state, ground_truth_comparable_state)

    state_match_score = 0.5 if state_match else 0.0

    # --- Function Call Matches Check ---
    # model_successful_func_calls is List[List[Dict[str, Any]]], one inner list per user turn's accumulated calls.
    # `or []` also covers the key being present with an explicit None value.
    model_successful_func_calls_per_turn = state.get("successful_func_calls") or []

    num_func_matches_for_score = 0  # Number of user turns where model's calls matched GT's calls for that turn

    num_gt_turns_with_calls = len(ground_truth_function_calls) if ground_truth_function_calls else 0
    num_model_turns_with_actual_calls = len(model_successful_func_calls_per_turn)

    if num_gt_turns_with_calls > 0:
        # Iterate over GT turns to see if the model matched them.
        # This handles cases where model makes fewer turns with calls than GT expects.
        for i, gt_calls_str_for_this_turn in enumerate(ground_truth_function_calls):
            model_calls_for_this_turn = (
                model_successful_func_calls_per_turn[i] if i < num_model_turns_with_actual_calls else []
            )

            try:
                gt_calls_for_this_turn = [_parse_function_call(call_str) for call_str in gt_calls_str_for_this_turn]
                # Logging uses the non-raising serializer: a value that JSON
                # cannot encode must not turn a matching turn into a failure.
                logger.debug(f"GT calls for turn {i}: {_safe_json_dumps(gt_calls_for_this_turn)}")
                logger.debug(f"Model calls for turn {i}: {_safe_json_dumps(model_calls_for_this_turn)}")

                is_match_for_turn, missing_gt_calls = _is_subsequence_unordered(
                    gt_calls_for_this_turn, model_calls_for_this_turn
                )
                if is_match_for_turn:
                    num_func_matches_for_score += 1
                    logger.debug(f"Turn {i} matched.")
                else:
                    logger.debug(
                        f"Turn {i} did NOT match. Missing GT calls in model's calls: {_safe_json_dumps(missing_gt_calls)}"
                    )
            except Exception as e:
                # Best effort: a turn that cannot be parsed or compared simply
                # does not count as matched.
                logger.error(f"Error comparing function calls for GT turn index {i}: {e}")

        # Full credit needs every GT turn matched AND the same number of turns;
        # any other combination scores zero for this component.
        all_gt_turns_matched = num_func_matches_for_score == num_gt_turns_with_calls
        same_number_of_turns = num_model_turns_with_actual_calls == num_gt_turns_with_calls
        func_match_score = 0.5 if (all_gt_turns_matched and same_number_of_turns) else 0.0
    else:
        # No calls expected: the model is only correct if it made none either.
        func_match_score = 0.5 if num_model_turns_with_actual_calls == 0 else 0.0

    reason_num_total_gt_turns_with_calls = (
        num_gt_turns_with_calls if num_gt_turns_with_calls > 0 else "0 (no GT calls expected)"
    )

    # --- Format Check (on model's response messages from the `messages` list) ---
    # For simplicity, every assistant message in `messages` is scanned; a
    # message counts as valid if it has non-blank content or any tool call.
    valid_assistant_messages = 0
    total_assistant_messages = 0

    for msg in messages:
        if isinstance(msg, Message) and msg.role == "assistant":
            total_assistant_messages += 1
            if (msg.content and msg.content.strip()) or msg.tool_calls:
                valid_assistant_messages += 1

    # Zero when there are no assistant messages at all, or when none of them
    # had content or tool calls; otherwise the full 0.2 (could be scaled).
    format_score = 0.2 if valid_assistant_messages > 0 else 0.0

    # --- Combine Scores ---
    base_score = state_match_score + func_match_score

    if base_score >= 1.0:
        final_score = base_score + format_score
        reason = "State and function calls matched ground truth."
        if format_score == 0.2:
            reason += " Format was also correct."
        else:
            reason += " Format was incorrect."
    else:
        final_score = 0.0
        reason = "State or function calls did not perfectly match ground truth."
        if state_match_score < 0.5:
            reason += " State match failed."
            if state_diffs:
                reason += f" Differences: {_safe_json_dumps(state_diffs)}"
        if func_match_score < 0.5:  # Check against 0.5 as perfect score for this component
            reason += f" Function call match failed ({num_func_matches_for_score}/{reason_num_total_gt_turns_with_calls} GT turns with calls matched)."

    # Add metrics
    metrics = {}
    metrics["state_match"] = MetricResult(
        score=state_match_score,
        is_score_valid=state_match_score == 0.5,
        reason=f"State match: {state_match}"
        + (f", Differences: {_safe_json_dumps(state_diffs)}" if state_diffs else ""),
    )
    metrics["function_call_match"] = MetricResult(
        score=func_match_score,
        is_score_valid=func_match_score == 0.5,  # Success if it gets the full 0.5 for this part
        reason=f"{num_func_matches_for_score}/{reason_num_total_gt_turns_with_calls} GT turns with calls matched by model. Model made calls in {num_model_turns_with_actual_calls} turn(s).",
    )
    metrics["format_check"] = MetricResult(
        score=format_score,
        is_score_valid=format_score == 0.2,  # Success if it gets the full 0.2 for format
        reason=f"{valid_assistant_messages}/{total_assistant_messages} assistant messages had correct format.",
    )

    return EvaluateResult(score=final_score, reason=reason, metrics=metrics)
|