eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130):
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,108 @@
1
+ import logging
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
+ from typing import List, Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
def get_pip_executable(venv_pip_path: Optional[str] = None) -> List[str]:
    """Determine the command prefix used to invoke pip.

    Resolution order:

    1. ``venv_pip_path``, when it names an existing, executable regular file.
    2. A ``pip`` (``pip.exe`` on Windows) located in the same directory as
       ``sys.executable``.
    3. ``[sys.executable, "-m", "pip"]`` as the final fallback.

    Args:
        venv_pip_path: Optional path to a specific pip executable.

    Returns:
        The argv prefix for pip; callers append the sub-command and its
        arguments (e.g. ``["install", "-r", path]``).
    """

    def _is_executable_file(path: str) -> bool:
        # exists() + access(X_OK) is also true for directories (X_OK means
        # "searchable" there), so insist on a regular file to avoid handing a
        # directory to subprocess as the program to run.
        return os.path.isfile(path) and os.access(path, os.X_OK)

    if venv_pip_path:
        if _is_executable_file(venv_pip_path):
            logger.info(f"Using specified pip executable: {venv_pip_path}")
            return [venv_pip_path]
        # Make the silent fall-through visible: the caller asked for a
        # specific pip and is not getting it.
        logger.warning(
            f"Specified pip executable is not an executable file: {venv_pip_path}. Falling back to auto-detection."
        )

    # Try to find pip in the current virtual environment's scripts/bin directory.
    # sys.executable should be /path/to/.venv/bin/python, so pip should be
    # /path/to/.venv/bin/pip (on Windows: /path/to/.venv/Scripts/pip.exe).
    potential_pip_path = os.path.join(os.path.dirname(sys.executable), "pip")
    if os.name == "nt":  # Windows check
        potential_pip_path += ".exe"

    if _is_executable_file(potential_pip_path):
        logger.info(f"Using auto-detected pip executable: {potential_pip_path}")
        return [potential_pip_path]

    # Fallback to sys.executable -m pip (should generally work if python is from the venv)
    logger.info(f"Using pip via: {sys.executable} -m pip")
    return [sys.executable, "-m", "pip"]
32
+
33
+
34
def install_requirements(
    requirements_list: List[str],
    venv_pip_path: Optional[str] = None,  # User can specify e.g. ".venv/bin/pip"
    log_output: bool = True,
) -> None:
    """
    Installs a list of Python package requirements using pip.

    Requirements are stripped, de-duplicated and sorted, written to a
    temporary requirements file, and installed with ``pip install -r``.
    The temporary file is removed afterwards regardless of the outcome.

    Args:
        requirements_list: A list of requirement strings (e.g., ["package_a==1.0", "package_b>=2.0"]).
        venv_pip_path: Optional path to the specific pip executable to use.
        log_output: If True, logs the stdout and stderr of the pip command.

    Raises:
        RuntimeError: If pip exits with a non-zero status; the originating
            ``subprocess.CalledProcessError`` is attached as ``__cause__``.
        FileNotFoundError: If the resolved pip executable cannot be started.
    """
    if not requirements_list:
        logger.debug("No requirements provided to install.")
        return

    unique_requirements = sorted({req.strip() for req in requirements_list if req.strip()})
    if not unique_requirements:
        logger.debug("No unique, non-empty requirements to install after stripping.")
        return

    pip_command_parts = get_pip_executable(venv_pip_path)

    # mkstemp leaves the file on disk after it is closed, which lets the pip
    # child process open it (on Windows a file still held open for writing
    # cannot be opened by another process). It is deleted manually in the
    # finally block.
    tmp_req_fd, tmp_req_file_path = tempfile.mkstemp(suffix=".txt", prefix="rk_reqs_")

    try:
        with os.fdopen(tmp_req_fd, "w") as tmp_req_file:
            for req in unique_requirements:
                tmp_req_file.write(req + "\n")

        logger.info(
            f"Attempting to install requirements: {unique_requirements} using pip command: {' '.join(pip_command_parts)} -r {tmp_req_file_path}"
        )

        command = pip_command_parts + ["install", "-r", tmp_req_file_path]

        process = subprocess.run(
            command,
            check=True,  # Raise CalledProcessError on non-zero exit
            capture_output=True,
            text=True,  # Decodes stdout/stderr as text
            encoding="utf-8",  # Explicit encoding
            errors="replace",  # Handle potential encoding errors in pip output
        )
        if log_output and process.stdout:
            logger.info(f"Pip install stdout:\n{process.stdout.strip()}")
        # pip often uses stderr for progress/warnings even on success
        if log_output and process.stderr:
            logger.info(f"Pip install stderr:\n{process.stderr.strip()}")
        logger.info(f"Successfully installed requirements: {unique_requirements}")

    except subprocess.CalledProcessError as e:
        error_message = f"Error installing requirements from {tmp_req_file_path}.\n"
        error_message += f"Command: {' '.join(e.cmd)}\n"
        if e.stdout:
            error_message += f"Pip stdout:\n{e.stdout.strip()}\n"
        if e.stderr:
            error_message += f"Pip stderr:\n{e.stderr.strip()}\n"
        logger.error(error_message)
        # Chain explicitly so the traceback shows the pip failure as the cause.
        raise RuntimeError(
            f"Failed to install requirements: {unique_requirements}. Details:\n{e.stderr or e.stdout or str(e)}"
        ) from e
    except FileNotFoundError:
        logger.error(
            f"Pip executable not found: {' '.join(pip_command_parts)}. Please ensure pip is installed and in PATH, or venv_pip_path is correct."
        )
        raise
    finally:
        # Cleanup is best-effort: a failure to delete the temp file must not
        # replace the real outcome (success or the install error) with an
        # unrelated OSError.
        try:
            os.remove(tmp_req_file_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temporary requirements file {tmp_req_file_path}: {cleanup_error}")
        else:
            logger.debug(f"Removed temporary requirements file: {tmp_req_file_path}")
@@ -0,0 +1,305 @@
1
+ """
2
+ General Static Policy for MCP Environment Testing
3
+
4
+ This policy provides a deterministic, non-LLM action sequence for fast iteration
5
+ across different MCP environments. It can be configured with custom tool names
6
+ and action sequences.
7
+
8
+ This is useful for:
9
+ - Fast testing of multi-session functionality
10
+ - Debugging environment behavior
11
+ - Performance testing without LLM overhead
12
+ """
13
+
14
+ import asyncio
15
+ import json
16
+ import logging
17
+ import os
18
+ import random
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+ # Import the base policy and types for proper recording functionality
22
+ from eval_protocol.mcp.types import LLMUsageStats, MCPToolCall
23
+ from eval_protocol.playback_policy import PlaybackPolicyBase
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class StaticPolicy(PlaybackPolicyBase):
    """
    Static policy that follows a predetermined action sequence.

    Can be configured for different environments with custom tool names and actions.
    Each environment index advances through the sequence independently; once
    the sequence is exhausted the last action is repeated.
    """

    def __init__(
        self,
        tool_name: str,
        action_sequence: Optional[List[str]] = None,
        available_actions: Optional[List[str]] = None,
        **kwargs,
    ):
        """
        Initialize static policy with recording/playback support.

        Args:
            tool_name: Name of the tool to call for actions (e.g., "lake_move", "lander_action")
            action_sequence: List of actions to execute. If None or empty, uses a default
                sequence: the first (up to) six ``available_actions``, or
                ``["DEFAULT_ACTION"]`` when no available actions are given.
            available_actions: List of all available actions for this environment.
            **kwargs: Additional arguments passed to PlaybackPolicyBase
        """
        # Initialize parent class for recording/playback functionality
        super().__init__(**kwargs)

        self.tool_name = tool_name
        self.available_actions = available_actions or []

        if action_sequence is not None and len(action_sequence) > 0:
            self.action_sequence = action_sequence
        else:
            if action_sequence is not None:
                # An explicitly empty sequence would make live generation fail
                # with IndexError on ``action_sequence[-1]``; treat it like
                # "not provided" instead.
                logger.warning("Empty action_sequence provided to StaticPolicy; using the default sequence.")
            if self.available_actions:
                # Use first few actions as default sequence
                self.action_sequence = self.available_actions[:6]
            else:
                self.action_sequence = ["DEFAULT_ACTION"]

        # Number of live tool calls generated so far, keyed by environment index.
        self.step_counts: Dict[int, int] = {}

    async def _generate_live_tool_calls(
        self,
        tool_schemas: List[Dict],
        env_index: int,
        conversation_history: List[Dict[str, Any]],
    ) -> Tuple[List[MCPToolCall], Optional[LLMUsageStats]]:
        """
        Generate tool calls in live mode using the static action sequence.

        This implements the abstract method from PlaybackPolicyBase.

        Args:
            tool_schemas: Available tools for this environment (not consulted here)
            env_index: Environment index
            conversation_history: Current conversation history for this environment
                (not consulted here)

        Returns:
            A tuple ``([tool_call], None)``: a single-element list of MCPToolCall
            objects, and ``None`` in the usage-stats slot.
        """
        # Get current step count for this environment
        step_count = self.step_counts.get(env_index, 0)

        # Clamp to the final index so the last action repeats after the
        # sequence completes.
        last_index = len(self.action_sequence) - 1
        action = self.action_sequence[min(step_count, last_index)]

        # Create tool call in MCPToolCall format
        tool_call = MCPToolCall(tool_name=self.tool_name, arguments={"action": action})

        # Update step count
        self.step_counts[env_index] = step_count + 1

        logger.debug(f"🎮 Env {env_index} step {step_count}: {action}")

        return [tool_call], None

    def add_tool_response(
        self,
        env_index: int,
        tool_call: MCPToolCall,
        tool_response: Union[str, List[Dict[str, Any]]],
        conversation_history: List[Dict[str, Any]],
        reward: float = 0.0,
        terminated: bool = False,
        info: Optional[Dict[str, Any]] = None,
    ):
        """
        Add tool call and response to conversation history for recording.

        Appends a ``{"role": "tool", ...}`` message to ``conversation_history``
        in place.

        Args:
            env_index: Environment index (used only for the fallback call id)
            tool_call: The tool call this response belongs to
            tool_response: Content stored as the tool message's ``content``
            conversation_history: Message list to search and append to
            reward: Reward to record in the message metadata
            terminated: Termination flag to record in the message metadata
            info: Extra info to record in the message metadata
        """
        # Find the most recent assistant message with tool calls to get the correct call_id
        call_id = None
        for message in reversed(conversation_history):
            if message.get("role") != "assistant" or "tool_calls" not in message:
                continue
            # First tool call in this message whose function name matches ours
            call_id = next(
                (
                    tc["id"]
                    for tc in (message.get("tool_calls") or [])
                    if tc["function"]["name"] == tool_call.tool_name
                ),
                None,
            )
            if call_id:
                break

        # Fallback if no matching tool call found
        if not call_id:
            call_id = f"call_{env_index}_{len(conversation_history)}"

        tool_message = {
            "role": "tool",
            "tool_call_id": call_id,
            "content": tool_response,
        }

        # Add control plane metadata only when something non-default was provided
        if reward != 0.0 or terminated or info:
            tool_message["metadata"] = {
                "reward": reward,
                "terminated": terminated,
                "info": info or {},
            }

        conversation_history.append(tool_message)

    def log_conversation_state_for_playback(
        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
    ):
        """
        Log the current conversation state in the format required for playback.

        Appends one JSON line to the file named by the EP_PLAYBACK_FILE
        environment variable; does nothing when that variable is unset or empty.

        Expected format: {"env_index": 0, "step": 0, "messages": [{..}, {..}]}

        Args:
            env_index: Environment index
            step: Current step number
            conversation_history: List of conversation messages
        """
        # Use EP_PLAYBACK_FILE environment variable for recording
        playback_file = os.environ.get("EP_PLAYBACK_FILE")
        if not playback_file:
            return  # No recording file specified

        playback_entry = {
            "env_index": env_index,
            "step": step,
            "messages": conversation_history,
        }

        # Explicit encoding keeps the recording independent of the locale.
        with open(playback_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(playback_entry) + "\n")

    @property
    def model_id(self) -> str:
        """Model identifier for static policy."""
        return f"static-policy-{self.tool_name}-v1"
184
+
185
+
186
class RandomPolicy(PlaybackPolicyBase):
    """
    Random policy that selects random actions.
    Useful for testing environment robustness.
    """

    def __init__(
        self,
        tool_name: str,
        available_actions: List[str],
        seed: Optional[int] = None,
        **kwargs,
    ):
        """
        Initialize random policy with recording/playback support.

        Args:
            tool_name: Name of the tool to call for actions
            available_actions: List of all available actions for this environment
            seed: Random seed for reproducibility
            **kwargs: Additional arguments passed to PlaybackPolicyBase
        """
        # Initialize parent class for recording/playback functionality
        super().__init__(**kwargs)

        self.tool_name = tool_name
        self.available_actions = available_actions
        # Private generator so seeding does not touch the global ``random`` state.
        self.random = random.Random(seed)

    async def _generate_live_tool_calls(
        self,
        tool_schemas: List[Dict],
        env_index: int,
        conversation_history: List[Dict[str, Any]],
    ) -> Tuple[List[MCPToolCall], Optional[LLMUsageStats]]:
        """
        Generate random tool calls in live mode.

        Args:
            tool_schemas: Available tools for this environment (not consulted here)
            env_index: Environment index
            conversation_history: Current conversation history for this environment
                (not consulted here)

        Returns:
            A tuple ``([tool_call], None)``: a single-element list of MCPToolCall
            objects, and ``None`` in the usage-stats slot.

        Raises:
            ValueError: If ``available_actions`` is empty.
        """
        if not self.available_actions:
            # random.choice would raise a bare IndexError here; say what is wrong.
            raise ValueError("RandomPolicy requires at least one available action to generate tool calls.")

        # Select random action
        action = self.random.choice(self.available_actions)

        # Create tool call
        tool_call = MCPToolCall(tool_name=self.tool_name, arguments={"action": action})

        logger.debug(f"🎲 Env {env_index}: {action}")

        return [tool_call], None

    def add_tool_response(
        self,
        env_index: int,
        tool_call: MCPToolCall,
        tool_response: Union[str, List[Dict[str, Any]]],
        conversation_history: List[Dict[str, Any]],
        reward: float = 0.0,
        terminated: bool = False,
        info: Optional[Dict[str, Any]] = None,
    ):
        """
        Add tool call and response to conversation history for recording.

        Appends a ``{"role": "tool", ...}`` message to ``conversation_history``
        in place.

        Args:
            env_index: Environment index (used only for the fallback call id)
            tool_call: The tool call this response belongs to
            tool_response: Content stored as the tool message's ``content``
            conversation_history: Message list to search and append to
            reward: Reward to record in the message metadata
            terminated: Termination flag to record in the message metadata
            info: Extra info to record in the message metadata
        """
        # Find the most recent assistant message with tool calls
        call_id = None
        for message in reversed(conversation_history):
            if message.get("role") != "assistant" or "tool_calls" not in message:
                continue
            # First tool call in this message whose function name matches ours
            call_id = next(
                (
                    tc["id"]
                    for tc in (message.get("tool_calls") or [])
                    if tc["function"]["name"] == tool_call.tool_name
                ),
                None,
            )
            if call_id:
                break

        # Fallback if no matching tool call found
        if not call_id:
            call_id = f"call_{env_index}_{len(conversation_history)}"

        tool_message = {
            "role": "tool",
            "tool_call_id": call_id,
            "content": tool_response,
        }

        # Add control plane metadata only when something non-default was provided
        if reward != 0.0 or terminated or info:
            tool_message["metadata"] = {
                "reward": reward,
                "terminated": terminated,
                "info": info or {},
            }

        conversation_history.append(tool_message)

    def log_conversation_state_for_playback(
        self, env_index: int, step: int, conversation_history: List[Dict[str, Any]]
    ):
        """
        Log the current conversation state for playback recording.

        Appends one JSON line of the form
        {"env_index": 0, "step": 0, "messages": [{..}, {..}]} to the file named
        by the EP_PLAYBACK_FILE environment variable; does nothing when that
        variable is unset or empty.

        Args:
            env_index: Environment index
            step: Current step number
            conversation_history: List of conversation messages
        """
        playback_file = os.environ.get("EP_PLAYBACK_FILE")
        if not playback_file:
            return

        playback_entry = {
            "env_index": env_index,
            "step": step,
            "messages": conversation_history,
        }

        # Explicit encoding keeps the recording independent of the locale.
        with open(playback_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(playback_entry) + "\n")

    @property
    def model_id(self) -> str:
        """Model identifier for random policy."""
        return f"random-policy-{self.tool_name}-v1"