eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,187 @@
1
+ """
2
+ Adapters for integrating reward-kit reward functions with TRL (Transformer Reinforcement Learning) trainers.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any, Callable, Dict, List, Optional, Union
7
+
8
+ from eval_protocol.models import Message
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
def create_trl_adapter(
    reward_fn: Callable,
    dataset_to_reward_kwargs_map: Dict[str, str],
    static_reward_kwargs: Optional[Dict[str, Any]] = None,
    user_message_fn: Optional[Callable[[Any], str]] = None,  # Function to construct user message content
    assistant_message_fn: Optional[Callable[[Any], str]] = None,  # Function to construct assistant message content
) -> Callable[[List[Any], List[str]], List[float]]:
    """
    Creates an adapter function compatible with TRL trainers (e.g., GRPOTrainer, PPOTrainer)
    from a reward-kit reward function.

    The TRL trainer expects a reward function with the signature:
        (prompts: List[str], completions: List[str], **kwargs: Any) -> List[float]
    where **kwargs contains other columns from the HuggingFace dataset.

    Args:
        reward_fn: The reward function to adapt. It is called as
            ``reward_fn(messages=[user_msg, assistant_msg], **kwargs)`` and is
            expected to return a dict-like result with a ``'score'`` entry.
            Results that expose ``score`` as an attribute instead of through
            ``.get`` are accepted as well.
        dataset_to_reward_kwargs_map: Maps dataset column names (keys of the
            **kwargs passed by TRL) to parameter names of ``reward_fn``.
            Example: ``{"test_cases_column": "test_cases_param"}`` passes
            ``kwargs['test_cases_column'][i]`` as ``test_cases_param`` for sample ``i``.
        static_reward_kwargs: Keyword arguments passed unchanged to ``reward_fn``
            for every sample. Per-sample values from the dataset take precedence
            when a name appears in both.
            Example: ``{"language": "python", "timeout": 10}``
        user_message_fn: Optional function turning a prompt item into the user
            message content. If None, ``str(prompt)`` is used.
        assistant_message_fn: Optional function turning a completion into the
            assistant message content. If None, a string completion is used
            as-is and a single-element ``[{"content": "..."}]`` list is unwrapped.

    Returns:
        An adapter function that can be passed to TRL trainers. It returns one
        float per evaluated sample; a sample whose ``reward_fn`` call raises or
        yields no score is assigned 0.0.
    """
    static_kwargs: Dict[str, Any] = {} if static_reward_kwargs is None else static_reward_kwargs

    def trl_reward_pipeline(
        prompts: List[Any],
        completions: Optional[List[str]] = None,
        **kwargs: Any,  # Contains other dataset columns, e.g., kwargs['test_cases']
    ) -> List[float]:
        """
        This is the actual function TRL will call.

        Note: completions parameter is optional to handle cases where prompts already
        contain complete conversations.
        """
        batch_size = len(prompts)

        # If completions is None, assume prompts contains complete conversations
        if completions is None:
            completions = [""] * batch_size

        num_samples = batch_size
        if len(completions) != batch_size:
            logger.warning(
                f"Mismatch in lengths of prompts ({batch_size}) and "
                f"completions ({len(completions)}). Using min length."
            )
            num_samples = min(batch_size, len(completions))

        # Resolve the dataset columns once so the loop below only indexes into them.
        mapped_kwargs_data = _collect_mapped_columns(
            dataset_to_reward_kwargs_map, kwargs, batch_size, num_samples
        )

        scores: List[float] = []
        for i in range(num_samples):
            current_prompt_item: Any = prompts[i]
            current_completion: Any = completions[i]

            # Without user_message_fn a non-string prompt is simply stringified.
            user_content = user_message_fn(current_prompt_item) if user_message_fn else str(current_prompt_item)
            if assistant_message_fn:
                assistant_content = assistant_message_fn(current_completion)
            else:
                assistant_content = _completion_text(current_completion)

            messages_for_reward = [
                Message(role="user", content=user_content),
                Message(role="assistant", content=assistant_content),
            ]

            # Dynamic (per-sample) values override static ones on a name clash.
            final_reward_fn_kwargs = {
                **static_kwargs,
                **{param: column[i] for param, column in mapped_kwargs_data.items()},
            }

            try:
                reward_output = reward_fn(messages=messages_for_reward, **final_reward_fn_kwargs)

                # Prefer mapping-style access; fall back to a `score` attribute.
                getter = getattr(reward_output, "get", None)
                score = getter("score") if callable(getter) else getattr(reward_output, "score", None)

                if score is None:
                    logger.warning(
                        f"Sample {i}: 'score' key not found in reward_output_dict or is None. "
                        f"Output: {reward_output}. Assigning 0.0."
                    )
                    scores.append(0.0)
                else:
                    scores.append(float(score))

            except Exception as e:
                # Best-effort by design: one failing sample must not abort the batch.
                logger.error(
                    f"Error calling reward_fn for sample {i} (prompt: '{str(current_prompt_item)[:50]}...'): {e}",
                    exc_info=True,
                )
                scores.append(0.0)  # Assign 0 score on error

        if scores:
            logger.debug(
                f"Batch rewards calculated by TRL adapter. Count: {len(scores)}, "
                f"Min: {min(scores)}, Max: {max(scores)}, Avg: {sum(scores)/len(scores):.2f}"
            )
        return scores

    return trl_reward_pipeline


def _collect_mapped_columns(
    column_to_param: Dict[str, str],
    trl_kwargs: Dict[str, Any],
    batch_size: int,
    num_samples: int,
) -> Dict[str, List[Any]]:
    """Return, per reward_fn parameter name, the list of per-sample values to pass.

    A column is used when it is a list whose length equals either the original
    batch size or the (possibly truncated) number of evaluated samples, so a
    prompts/completions length mismatch no longer discards otherwise valid
    columns. Missing or malformed columns yield ``None`` for every sample.
    """
    columns: Dict[str, List[Any]] = {}
    for dataset_col_name, reward_fn_param_name in column_to_param.items():
        if dataset_col_name not in trl_kwargs:
            logger.warning(
                f"Dataset column '{dataset_col_name}' (mapped to reward_fn param "
                f"'{reward_fn_param_name}') not found in TRL kwargs. "
                f"Reward function will receive None for this parameter for all samples."
            )
            columns[reward_fn_param_name] = [None] * num_samples
            continue

        data_list = trl_kwargs[dataset_col_name]
        if isinstance(data_list, list) and len(data_list) in (batch_size, num_samples):
            columns[reward_fn_param_name] = data_list
            continue

        logger.error(
            f"Data for dataset column '{dataset_col_name}' is not a list of "
            f"length {batch_size}. Received: {data_list}. "
            f"Reward function will receive None for this parameter for all samples."
        )
        columns[reward_fn_param_name] = [None] * num_samples
    return columns


def _completion_text(completion: Any) -> str:
    """Return the assistant text for a completion given as a string or a one-message list."""
    if isinstance(completion, str):
        return completion
    if (
        isinstance(completion, list)
        and len(completion) == 1
        and isinstance(completion[0], dict)
        and isinstance(completion[0].get("content"), str)
    ):
        # Handles cases like [{'role':'assistant', 'content':'actual_text'}]
        return completion[0]["content"]
    # Fallback if the completion is an unexpected type
    logger.warning(
        f"Completion for assistant message was not a string or expected list/dict structure: {completion}. Using str()."
    )
    return str(completion)
@@ -0,0 +1,48 @@
1
+ """
2
+ Reward-Kit MCP Integration Framework
3
+
4
+ This module provides utilities for creating MCP servers that integrate
5
+ with reward-kit environments and evaluation workflows.
6
+
7
+ It also provides the refactored MCP environment components for better modularity.
8
+ """
9
+
10
+ from .adapter import EnvironmentAdapter
11
+
12
+ # New refactored components
13
+ from .client import MCPConnectionManager
14
+ from .execution import LLMBasePolicy, OpenAIPolicy, ExecutionManager
15
+
16
+ # FireworksPolicy is imported conditionally by execution.__init__.py
17
+ try:
18
+ from .execution import FireworksPolicy
19
+ except ImportError:
20
+ FireworksPolicy = None
21
+
22
+ # North Star MCP-Gym Framework
23
+ from .mcpgym import McpGym
24
+ from .session import GeneralMCPVectorEnv
25
+ from .simulation_server import SimulationServerBase
26
+ from .types import DatasetRow, MCPSession, MCPToolCall, Trajectory
27
+
28
# Public names of this package. FireworksPolicy is listed right after
# OpenAIPolicy, but only when its optional import above succeeded.
__all__ = [
    # Legacy MCP server components
    "EnvironmentAdapter",
    "SimulationServerBase",
    # New refactored components
    "MCPConnectionManager",
    "LLMBasePolicy",
    "OpenAIPolicy",
    *(["FireworksPolicy"] if FireworksPolicy is not None else []),
    "ExecutionManager",
    "GeneralMCPVectorEnv",
    "MCPSession",
    "MCPToolCall",
    "DatasetRow",
    "Trajectory",
    # North Star MCP-Gym Framework
    "McpGym",
]
@@ -0,0 +1,131 @@
1
+ """
2
+ Environment Adapter Interface
3
+
4
+ This defines the interface that users implement to connect their
5
+ environments to the MCP framework. It also provides default implementations
6
+ that work with most gymnasium-style and complex environments.
7
+ """
8
+
9
+ import json
10
+ from typing import Any, Dict, Optional, Tuple
11
+
12
+
13
class EnvironmentAdapter:
    """
    Environment adapter with default implementations.

    Users can either use this class directly by providing an env_class,
    or inherit from it to customize specific methods for their environment.
    This provides a clean separation between the MCP protocol layer
    and the environment implementation.

    The default implementations call ``reset(seed=...)``, ``step(action)`` and
    ``close()`` on the environment and construct it as ``env_class(config=...)``.
    """

    def __init__(self, env_class: Any = None, default_config: Optional[Dict[str, Any]] = None):
        """
        Initialize the environment adapter.

        Args:
            env_class: The environment class to instantiate (required for default implementation)
            default_config: Default configuration for environment creation
        """
        self.env_class = env_class
        self.default_config = default_config or {}

    def create_environment(self, config: Optional[Dict[str, Any]] = None) -> Any:
        """
        Create and return a new environment instance.

        The environment receives the default configuration overlaid with
        ``config``. The merge is done on a copy, so overrides passed here never
        alter the adapter's stored defaults (or the dict the caller supplied).

        Args:
            config: Optional configuration dict for environment creation

        Returns:
            Environment instance (type depends on the specific implementation)

        Raises:
            NotImplementedError: If no env_class was provided and this method
                was not overridden.
        """
        if self.env_class is None:
            raise NotImplementedError("env_class must be provided or create_environment must be overridden")

        # Shallow-copy first: get_default_config() hands back the shared dict.
        env_config: Dict[str, Any] = dict(self.get_default_config() or {})
        if config:
            env_config.update(config)

        return self.env_class(config=env_config)

    def create_environment_with_seed(
        self, config: Optional[Dict[str, Any]] = None, seed: Optional[int] = None
    ) -> Tuple[Any, Any, Dict[str, Any]]:
        """
        Create a new environment instance and reset it with a specific seed.

        Args:
            config: Optional configuration dict for environment creation
            seed: Optional seed forwarded to the environment's ``reset``

        Returns:
            Tuple of (environment, initial_observation, info_dict)
        """
        env = self.create_environment(config)
        obs, info = env.reset(seed=seed)
        return env, obs, info

    def reset_environment(self, env: Any, seed: Optional[int] = None) -> Tuple[Any, Dict[str, Any]]:
        """
        Reset the environment to initial state.

        Args:
            env: Environment instance
            seed: Optional seed for reproducibility

        Returns:
            Tuple of (initial_observation, info_dict)
        """
        return env.reset(seed=seed)

    def step_environment(self, env: Any, action: Any) -> Tuple[Any, float, bool, bool, Dict[str, Any]]:
        """
        Execute one step in the environment.

        Args:
            env: Environment instance
            action: Action to execute (type depends on environment)

        Returns:
            Tuple of (observation, reward, terminated, truncated, info)
        """
        return env.step(action)

    def close_environment(self, env: Any) -> None:
        """
        Clean up environment resources.

        Args:
            env: Environment instance to close
        """
        env.close()

    def parse_action(self, action_str: str) -> Any:
        """
        Parse action string from MCP tool call into environment action.

        The default implementation decodes the string as JSON.

        Args:
            action_str: Action string from MCP client

        Returns:
            Action in format expected by environment

        Raises:
            json.JSONDecodeError: If ``action_str`` is not valid JSON.
        """
        return json.loads(action_str)

    def format_observation(self, observation: Any) -> Any:
        """
        Format environment observation for MCP response.

        The default implementation returns the observation unchanged; override
        it when raw observations are not JSON-serializable.

        Args:
            observation: Raw observation from environment

        Returns:
            JSON-serializable observation data
        """
        return observation

    def get_default_config(self) -> Dict[str, Any]:
        """
        Get the default environment configuration.

        Returns:
            The adapter's stored default configuration dict (not a copy).
        """
        return self.default_config
@@ -0,0 +1,12 @@
1
+ """
2
+ MCP Client Connection Management
3
+
4
+ This module handles MCP client connections, session initialization,
5
+ and resource/tool discovery.
6
+ """
7
+
8
+ from .connection import MCPConnectionManager
9
+
10
# Sole public export of the client subpackage.
__all__ = ["MCPConnectionManager"]