eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,313 @@
1
+ import ast
2
+ import inspect
3
+ import json
4
+ import logging # Import logging
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ from eval_protocol.agent.resources.bfcl_sim_api_resource import BFCLSimAPIResource
8
+ from eval_protocol.models import EvaluateResult, Message, MetricResult
9
+ from eval_protocol.typed_interface import reward_function
10
+
11
# Module-level logger registered under the fixed name "bfcl_reward"; the helpers
# and the reward function below all log through it.
logger = logging.getLogger("bfcl_reward")
# Lower this logger's own threshold to DEBUG so the per-turn debug records below
# are not dropped by the logger itself.
# NOTE(review): this is set at import time regardless of application logging
# configuration -- confirm that forcing DEBUG from library code is intended.
logger.setLevel(logging.DEBUG)  # Ensure debug logs are processed
14
+
15
+
16
+ # Helper functions adapted from BfclRubric (copied here)
17
+ def _parse_function_call(func_call_str: str):
18
+ """
19
+ Parses a function call string into a JSON-like dictionary.
20
+
21
+ :param func_call_str: String representation of a function call.
22
+ :return: JSON-like dictionary with function name and arguments.
23
+ """
24
+ try:
25
+ tree = ast.parse(func_call_str, mode="eval")
26
+ if not isinstance(tree.body, ast.Call):
27
+ raise ValueError("Input is not a valid function call.")
28
+ func_name = tree.body.func.id if isinstance(tree.body.func, ast.Name) else None
29
+ if not func_name:
30
+ raise ValueError("Could not determine function name.")
31
+ args_dict = {}
32
+ for kw in tree.body.keywords:
33
+ args_dict[kw.arg] = ast.literal_eval(kw.value)
34
+ for i, arg in enumerate(tree.body.args):
35
+ args_dict[f"pos_arg_{i}"] = ast.literal_eval(arg) # Standardized positional arg key
36
+
37
+ json_obj = {"name": func_name, "args": args_dict}
38
+ return json_obj
39
+ except Exception:
40
+ raise ValueError(f"Error parsing function call string: {func_call_str}")
41
+
42
+
43
+ def _are_function_calls_equivalent(call1: Dict[str, Any], call2: Dict[str, Any]) -> bool:
44
+ """
45
+ Compares two parsed function call dictionaries for semantic equivalence.
46
+ Special handling for 'sort' command arguments.
47
+ """
48
+ if not isinstance(call1, dict) or not isinstance(call2, dict):
49
+ logger.warning(f"Invalid input to _are_function_calls_equivalent: call1={call1}, call2={call2}")
50
+ return False
51
+
52
+ name1 = call1.get("name")
53
+ name2 = call2.get("name")
54
+
55
+ if name1 != name2:
56
+ return False
57
+
58
+ args1 = call1.get("args", {})
59
+ args2 = call2.get("args", {})
60
+
61
+ if not isinstance(args1, dict) or not isinstance(args2, dict):
62
+ logger.warning(f"Invalid args in _are_function_calls_equivalent: args1={args1}, args2={args2}")
63
+ return False # Should be dicts
64
+
65
+ if name1 == "sort":
66
+ val1 = args1.get("pos_arg_0", args1.get("file_name"))
67
+ val2 = args2.get("pos_arg_0", args2.get("file_name"))
68
+
69
+ # Check if both extracted values are not None and are equal
70
+ # And that both arg dicts have only one relevant key for the sort value
71
+ # (to avoid matching if one has extra unrecognized args beyond the file name)
72
+
73
+ # Condition 1: Both args have exactly one key
74
+ cond1_holds = len(args1) == 1 and len(args2) == 1
75
+ # Condition 2: The values associated with the (potentially different) keys are the same
76
+ cond2_holds = val1 is not None and val1 == val2
77
+
78
+ if cond1_holds and cond2_holds:
79
+ return True
80
+ # Fallback to direct equality if the specific sort logic doesn't cleanly apply
81
+ # (e.g. if one has pos_arg_0 and the other has file_name AND other args, or different number of args)
82
+ return args1 == args2
83
+ else:
84
+ # For all other functions, use direct argument dictionary equality
85
+ return args1 == args2
86
+
87
+
88
+ def _is_subsequence_unordered(list1: List[Dict[str, Any]], list2: List[Dict[str, Any]]) -> tuple[bool, list]:
89
+ """
90
+ Checks if all elements of list1 are present in list2, using _are_function_calls_equivalent for comparison.
91
+ Also returns the elements of list1 that are not present in list2.
92
+ """
93
+ if not list1: # If list1 is empty, it's always a subsequence
94
+ return True, []
95
+ # If list1 is not empty but list2 is, list1 cannot be a subsequence.
96
+ # This also handles the case where list2 becomes empty during processing.
97
+ if not list2 and list1:
98
+ return False, list1[:]
99
+
100
+ list2_copy = list2[:] # Make a copy to modify
101
+ missing_elements = []
102
+
103
+ for item1 in list1:
104
+ found_match_in_list2 = False
105
+ for i, item2 in enumerate(list2_copy):
106
+ if _are_function_calls_equivalent(item1, item2):
107
+ list2_copy.pop(i) # Remove the matched element from list2_copy
108
+ found_match_in_list2 = True
109
+ break # Move to the next item in list1
110
+ if not found_match_in_list2:
111
+ missing_elements.append(item1)
112
+
113
+ is_subsequence = len(missing_elements) == 0
114
+ return is_subsequence, missing_elements
115
+
116
+
117
def compare_comparable_states(model_state: Dict[str, Any], gt_state: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
    """
    Compare two comparable-state dictionaries keyed by class name.

    :param model_state: State produced by the model run, ``{class_name: instance_state}``.
    :param gt_state: Expected ground-truth state with the same layout.
    :return: ``(all_match, differences)``.

        * If the two dictionaries do not have the same set of class names the
          result is ``False`` with a single ``"keys_mismatch"`` entry listing
          both key lists.
        * Otherwise ``differences`` maps each class whose instance state
          differs to ``{attribute: (model_value, gt_value)}``. An attribute
          missing on one side is reported with ``None`` for that side.
        * If a differing instance state is not a mapping on either side, the
          whole values are reported as ``{"non_mapping_state": (model, gt)}``
          instead of raising.
    """
    from collections.abc import Mapping

    if set(model_state.keys()) != set(gt_state.keys()):
        return False, {
            "keys_mismatch": {
                "model_keys": list(model_state.keys()),
                "gt_keys": list(gt_state.keys()),
            }
        }

    # Sentinel distinguishes "attribute absent" from "attribute present with None",
    # so such a mismatch is not silently dropped from the diff.
    absent = object()
    differences: Dict[str, Any] = {}

    for class_name, model_instance_state in model_state.items():
        gt_instance_state = gt_state[class_name]
        if model_instance_state == gt_instance_state:
            continue

        if not (isinstance(model_instance_state, Mapping) and isinstance(gt_instance_state, Mapping)):
            # No per-attribute view is available; report the raw values.
            differences[class_name] = {"non_mapping_state": (model_instance_state, gt_instance_state)}
            continue

        # Model attributes first, then attributes only the ground truth has.
        # This keeps the diff order stable across runs (a set union would not).
        ordered_keys = list(model_instance_state)
        ordered_keys.extend(k for k in gt_instance_state if k not in model_instance_state)

        differences[class_name] = {
            k: (model_instance_state.get(k), gt_instance_state.get(k))
            for k in ordered_keys
            if model_instance_state.get(k, absent) != gt_instance_state.get(k, absent)
        }

    # A class is recorded in `differences` exactly when its states differ.
    return not differences, differences
146
+
147
+
148
def _json_for_diagnostics(value: Any) -> str:
    """Serialize *value* to JSON for log and reason text without ever raising.

    Values ``json`` cannot encode are rendered with ``str``; if encoding still
    fails (for example unsupported dict key types), ``repr(value)`` is used.
    """
    try:
        return json.dumps(value, default=str)
    except (TypeError, ValueError):
        return repr(value)


@reward_function
def bfcl_reward(
    messages: List[Message],  # Full conversation, assistant responses are at the end
    ground_truth: Dict[str, Any],  # Contains 'function_calls' and 'comparable_state'
    state: Dict[str, Any],  # Runtime state (BFCLSimAPIResource, successful_func_calls)
    **kwargs: Any,
) -> EvaluateResult:
    """
    Evaluates agent performance on BFCL tasks based on state, function calls, and format.

    Three components are computed:

    * **state_match** (0.5): the resource's comparable state equals
      ``ground_truth["comparable_state"]``.
    * **function_call_match** (0.5): for every ground-truth turn, all expected
      calls appear (in any order) among the model's successful calls for that
      turn, and the model has exactly as many turns with calls as the ground
      truth. With no expected turns, the model must have made none.
    * **format_check** (0.2): at least one assistant message has non-blank
      content or tool calls.

    The final score is all-or-nothing: ``1.0 + format_score`` when both the
    state and the function calls match, otherwise ``0.0``.

    :param messages: Conversation messages; only assistant messages are inspected.
    :param ground_truth: Dict with ``"function_calls"`` (list of per-turn lists
        of call strings) and ``"comparable_state"``.
    :param state: Dict with ``"resource"`` (a ``BFCLSimAPIResource``) and
        optionally ``"successful_func_calls"`` (per-turn lists of parsed calls).
    :return: ``EvaluateResult`` with the final score, a reason string and the
        three component metrics. A zero score with empty metrics is returned
        when ground truth or the resource is missing.
    """
    ground_truth_function_calls: Optional[List[List[str]]] = ground_truth.get("function_calls")
    ground_truth_comparable_state: Optional[Dict[str, Any]] = ground_truth.get("comparable_state")

    # Log ground truth data received
    logger.debug("Ground truth function calls from input: %s", ground_truth_function_calls)
    logger.debug("Ground truth comparable state from input: %s", ground_truth_comparable_state)

    if ground_truth_function_calls is None or ground_truth_comparable_state is None:
        return EvaluateResult(
            score=0.0,
            reason="Ground truth 'function_calls' or 'comparable_state' not found in ground_truth dict.",
            metrics={},
        )

    # Access the BFCLSimAPIResource instance from the state
    bfcl_resource: Optional[BFCLSimAPIResource] = state.get("resource")
    if not isinstance(bfcl_resource, BFCLSimAPIResource):
        return EvaluateResult(
            score=0.0,
            reason="BFCLSimAPIResource instance not found in state.",
            metrics={},
        )

    # --- State Matches Check ---
    model_comparable_state = bfcl_resource.get_comparable_state()
    state_match, state_diffs = compare_comparable_states(model_comparable_state, ground_truth_comparable_state)
    state_match_score = 0.5 if state_match else 0.0

    # --- Function Call Matches Check ---
    # One inner list per user turn, holding that turn's accumulated successful calls.
    model_successful_func_calls_per_turn = state.get("successful_func_calls", [])

    num_func_matches_for_score = 0  # GT turns whose expected calls were all found in the model's turn
    num_gt_turns_with_calls = len(ground_truth_function_calls) if ground_truth_function_calls else 0
    num_model_turns_with_actual_calls = len(model_successful_func_calls_per_turn)

    if num_gt_turns_with_calls > 0:
        # Iterate over GT turns so a model with fewer turns is compared against
        # an empty call list for the turns it never reached.
        for i in range(num_gt_turns_with_calls):
            gt_calls_str_for_this_turn = ground_truth_function_calls[i]
            model_calls_for_this_turn = (
                model_successful_func_calls_per_turn[i] if i < num_model_turns_with_actual_calls else []
            )

            try:
                gt_calls_for_this_turn = [_parse_function_call(call_str) for call_str in gt_calls_str_for_this_turn]
                # Diagnostics use the non-raising serializer so that logging can
                # never turn a matching turn into a failed one.
                logger.debug("GT calls for turn %s: %s", i, _json_for_diagnostics(gt_calls_for_this_turn))
                logger.debug("Model calls for turn %s: %s", i, _json_for_diagnostics(model_calls_for_this_turn))

                is_match_for_turn, missing_gt_calls = _is_subsequence_unordered(
                    gt_calls_for_this_turn, model_calls_for_this_turn
                )
                if is_match_for_turn:
                    num_func_matches_for_score += 1
                    logger.debug("Turn %s matched.", i)
                else:
                    logger.debug(
                        "Turn %s did NOT match. Missing GT calls in model's calls: %s",
                        i,
                        _json_for_diagnostics(missing_gt_calls),
                    )
            except Exception as e:
                # Best effort: an unparsable GT call or malformed model data
                # counts as an unmatched turn rather than aborting the evaluation.
                logger.error("Error comparing function calls for GT turn index %s: %s", i, e)

        # Full credit only when every GT turn matched AND the model did not
        # produce a different number of turns with calls.
        all_turns_matched = (
            num_func_matches_for_score == num_gt_turns_with_calls
            and num_model_turns_with_actual_calls == num_gt_turns_with_calls
        )
        func_match_score = 0.5 if all_turns_matched else 0.0
    else:
        # No calls expected: the model must not have made any either.
        func_match_score = 0.5 if num_model_turns_with_actual_calls == 0 else 0.0

    reason_num_total_gt_turns_with_calls = (
        num_gt_turns_with_calls if num_gt_turns_with_calls > 0 else "0 (no GT calls expected)"
    )

    # --- Format Check (on model's response messages from the `messages` list) ---
    # All assistant messages in `messages` are scanned; a message is valid when
    # it has non-blank content or any tool call.
    total_assistant_messages = 0
    valid_assistant_messages = 0
    for msg in messages:
        if isinstance(msg, Message) and msg.role == "assistant":
            total_assistant_messages += 1
            if (msg.content and msg.content.strip()) or msg.tool_calls:
                valid_assistant_messages += 1

    # 0.0 when there are no assistant messages or none of them is valid.
    format_score = 0.2 if valid_assistant_messages > 0 else 0.0

    # --- Combine Scores ---
    base_score = state_match_score + func_match_score

    if base_score >= 1.0:
        final_score = base_score + format_score
        reason = "State and function calls matched ground truth."
        if format_score == 0.2:
            reason += " Format was also correct."
        else:
            reason += " Format was incorrect."
    else:
        final_score = 0.0
        reason = "State or function calls did not perfectly match ground truth."
        if state_match_score < 0.5:
            reason += " State match failed."
            if state_diffs:
                reason += f" Differences: {_json_for_diagnostics(state_diffs)}"
        if func_match_score < 0.5:  # 0.5 is the perfect score for this component
            reason += f" Function call match failed ({num_func_matches_for_score}/{reason_num_total_gt_turns_with_calls} GT turns with calls matched)."

    # Add metrics
    metrics = {
        "state_match": MetricResult(
            score=state_match_score,
            is_score_valid=state_match_score == 0.5,
            reason=f"State match: {state_match}"
            + (f", Differences: {_json_for_diagnostics(state_diffs)}" if state_diffs else ""),
        ),
        "function_call_match": MetricResult(
            score=func_match_score,
            is_score_valid=func_match_score == 0.5,  # Success if it gets the full 0.5 for this part
            reason=f"{num_func_matches_for_score}/{reason_num_total_gt_turns_with_calls} GT turns with calls matched by model. Model made calls in {num_model_turns_with_actual_calls} turn(s).",
        ),
        "format_check": MetricResult(
            score=format_score,
            is_score_valid=format_score == 0.2,  # Success if it gets the full 0.2 for format
            reason=f"{valid_assistant_messages}/{total_assistant_messages} assistant messages had correct format.",
        ),
    }

    return EvaluateResult(score=final_score, reason=reason, metrics=metrics)