eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,111 @@
1
+ """
2
+ Tool Registry for the Agent Evaluation Framework.
3
+ Provides a mechanism to register and manage tools.
4
+ """
5
+
6
+ import functools
7
+ import inspect
8
+ from typing import Any, Callable, Dict, List, Optional
9
+
10
+
11
class ToolRegistry:
    """
    Named collection of callables that agents can use as tools.

    Attributes:
        name: Identifier given to this registry at construction time
        tools: Maps each registered tool name to the registered function
        descriptions: Maps each registered tool name to its description text
        parameters: Maps each registered tool name to its parameter specification
    """

    def __init__(self, name: str):
        """
        Create an empty registry.

        Args:
            name: Identifier for this registry
        """
        self.name = name
        self.tools: Dict[str, Callable] = {}
        self.descriptions: Dict[str, str] = {}
        self.parameters: Dict[str, Dict[str, Any]] = {}

    def tool(self, description: str, parameters: Dict[str, Any]) -> Callable:
        """
        Build a decorator that records the decorated function as a tool.

        The tool is stored under the function's ``__name__``; registering a
        second function with the same name replaces the earlier entry.

        Args:
            description: Human-readable description of the tool
            parameters: Parameter specifications for the tool

        Returns:
            A decorator; applying it registers the function and returns a
            pass-through wrapper around it
        """

        def decorator(func: Callable) -> Callable:
            registered_as = func.__name__
            # Registration happens once, at decoration time. The undecorated
            # function is what gets stored, not the wrapper returned below.
            self.tools[registered_as] = func
            self.descriptions[registered_as] = description
            self.parameters[registered_as] = parameters

            @functools.wraps(func)
            def _invoke(*args, **kwargs):
                return func(*args, **kwargs)

            return _invoke

        return decorator

    def get_tool(self, tool_name: str) -> Optional[Callable]:
        """
        Look up a single registered tool.

        Args:
            tool_name: Name the tool was registered under

        Returns:
            The registered function, or None when no tool has that name
        """
        return self.tools.get(tool_name)

    def get_tools(self) -> Dict[str, Callable]:
        """
        Return every registered tool.

        Returns:
            The registry's own name-to-function dictionary (not a copy)
        """
        return self.tools

    def get_openai_tools(self) -> List[Dict[str, Any]]:
        """
        Describe the registered tools as function-calling specifications.

        Each tool's parameter specification becomes the ``properties`` of an
        ``object`` schema, and every listed parameter is marked as required.

        Returns:
            One specification dictionary per registered tool, in
            registration order
        """
        specs = []
        for tool_name in self.tools:
            properties = self.parameters.get(tool_name, {})
            schema = {
                "type": "object",
                "properties": properties,
                "required": list(properties.keys()),
            }
            specs.append(
                {
                    "name": tool_name,
                    "description": self.descriptions.get(tool_name, ""),
                    "parameters": schema,
                }
            )
        return specs

    def create_fastapi_app(self):
        """
        Placeholder for building a FastAPI app exposing the tools.

        Returns:
            A plain dictionary naming the app type and listing the registered
            tool names; no real FastAPI application is constructed
        """
        # This is a stub implementation
        tool_names = list(self.tools.keys())
        return {"app_type": "FastAPI", "tools": tool_names}
eval_protocol/auth.py ADDED
@@ -0,0 +1,156 @@
1
+ import configparser
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Dict, Optional # Added Dict
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
# Per-user Fireworks configuration directory and the credentials file inside it.
FIREWORKS_CONFIG_DIR = Path.home().joinpath(".fireworks")
AUTH_INI_FILE = FIREWORKS_CONFIG_DIR.joinpath("auth.ini")
11
+
12
+
13
+ def _parse_simple_auth_file(file_path: Path) -> Dict[str, str]:
14
+ """
15
+ Parses an auth file with simple key=value lines.
16
+ Handles comments starting with # or ;.
17
+ Strips whitespace and basic quotes from values.
18
+ """
19
+ creds = {}
20
+ if not file_path.exists():
21
+ return creds
22
+ try:
23
+ with open(file_path, "r") as f:
24
+ for line in f:
25
+ line = line.strip()
26
+ if not line or line.startswith("#") or line.startswith(";"):
27
+ continue
28
+ if "=" in line:
29
+ key, value = line.split("=", 1)
30
+ key = key.strip()
31
+ value = value.strip()
32
+ # Remove surrounding quotes if present
33
+ if value and (
34
+ (value.startswith('"') and value.endswith('"'))
35
+ or (value.startswith("'") and value.endswith("'"))
36
+ ):
37
+ value = value[1:-1]
38
+
39
+ if key in ["api_key", "account_id"] and value:
40
+ creds[key] = value
41
+ except Exception as e:
42
+ logger.warning(f"Error during simple parsing of {file_path}: {e}")
43
+ return creds
44
+
45
+
46
def _get_credential_from_config_file(key_name: str) -> Optional[str]:
    """
    Look up one credential (``api_key`` or ``account_id``) in auth.ini.

    The file is first scanned as flat ``key=value`` lines. Only when that
    scan does not yield the key is the file read with configparser, checking
    the ``[fireworks]`` section and then the parser's default section.

    Args:
        key_name: Name of the credential to look up.

    Returns:
        The credential value, or None when the file is missing, the key is
        absent or empty, or the file cannot be parsed.
    """
    if not AUTH_INI_FILE.exists():
        return None

    # Pass 1: flat key=value scan.
    flat_creds = _parse_simple_auth_file(AUTH_INI_FILE)
    try:
        flat_value = flat_creds[key_name]
    except KeyError:
        pass
    else:
        logger.debug(f"Using {key_name} from simple key-value parsing of {AUTH_INI_FILE}.")
        return flat_value

    # Pass 2: configparser. A file without section headers raises
    # MissingSectionHeaderError here, which is handled below.
    try:
        parser = configparser.ConfigParser()
        parser.read(AUTH_INI_FILE)

        # Explicit [fireworks] section takes priority.
        section_value = None
        if "fireworks" in parser and parser.has_option("fireworks", key_name):
            section_value = parser.get("fireworks", key_name)
        if section_value:
            logger.debug(f"Using {key_name} from [fireworks] section in {AUTH_INI_FILE}.")
            return section_value

        # Otherwise fall back to the parser's default section.
        default_value = None
        if parser.has_option(parser.default_section, key_name):
            default_value = parser.get(parser.default_section, key_name)
        if default_value:
            logger.debug(f"Using {key_name} from default section [{parser.default_section}] in {AUTH_INI_FILE}.")
            return default_value

    except configparser.MissingSectionHeaderError:
        # A header-less file is pure key=value, which pass 1 already covered,
        # so the key is most likely absent or malformed.
        logger.debug(f"{AUTH_INI_FILE} has no section headers, and simple parsing did not find {key_name}.")
    except configparser.Error as e_config:
        logger.warning(f"Configparser error reading {AUTH_INI_FILE} for {key_name}: {e_config}")
    except Exception as e_general:
        logger.warning(f"Unexpected error reading {AUTH_INI_FILE} for {key_name}: {e_general}")

    return None
91
+
92
+
93
def get_fireworks_api_key() -> Optional[str]:
    """
    Resolve the Fireworks API key.

    Lookup order:
    1. The FIREWORKS_API_KEY environment variable, when set and non-empty.
    2. The 'api_key' entry of ~/.fireworks/auth.ini, read either as a plain
       key=value line or through configparser ([fireworks] section, then the
       default section).

    Returns:
        The API key, or None when neither source provides one.
    """
    from_env = os.environ.get("FIREWORKS_API_KEY")
    if from_env:
        logger.debug("Using FIREWORKS_API_KEY from environment variable.")
        return from_env

    from_file = _get_credential_from_config_file("api_key")
    if not from_file:
        logger.debug("Fireworks API key not found in environment variables or auth.ini.")
        return None
    return from_file
115
+
116
+
117
def get_fireworks_account_id() -> Optional[str]:
    """
    Resolve the Fireworks Account ID.

    Lookup order:
    1. The FIREWORKS_ACCOUNT_ID environment variable, when set and non-empty.
    2. The 'account_id' entry of ~/.fireworks/auth.ini, read either as a
       plain key=value line or through configparser ([fireworks] section,
       then the default section).

    Returns:
        The Account ID, or None when neither source provides one.
    """
    from_env = os.environ.get("FIREWORKS_ACCOUNT_ID")
    if from_env:
        logger.debug("Using FIREWORKS_ACCOUNT_ID from environment variable.")
        return from_env

    from_file = _get_credential_from_config_file("account_id")
    if not from_file:
        logger.debug("Fireworks Account ID not found in environment variables or auth.ini.")
        return None
    return from_file
139
+
140
+
141
def get_fireworks_api_base() -> str:
    """
    Retrieves the Fireworks API base URL.

    The base URL is sourced from the FIREWORKS_API_BASE environment variable.
    If the variable is unset or empty, it defaults to
    "https://api.fireworks.ai".

    Returns:
        The API base URL (never an empty string).
    """
    default_base = "https://api.fireworks.ai"
    api_base = os.environ.get("FIREWORKS_API_BASE")
    if api_base:
        logger.debug("Using FIREWORKS_API_BASE from environment variable.")
        return api_base

    # An empty variable is treated like an unset one; otherwise callers would
    # receive "" as the base URL while the log claims the default is in use.
    logger.debug(f"FIREWORKS_API_BASE not set in environment, defaulting to {default_base}.")
    return default_base
eval_protocol/cli.py ADDED
@@ -0,0 +1,425 @@
1
+ """
2
+ Command-line interface for eval-protocol.
3
+ """
4
+
5
+ import argparse
6
+ import asyncio
7
+ import json
8
+ import logging
9
+ import os
10
+ import sys
11
+ import traceback
12
+ import uuid
13
+ from pathlib import Path
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ from eval_protocol.evaluation import create_evaluation, preview_evaluation
18
+
19
+ from .cli_commands.agent_eval_cmd import agent_eval_command
20
+ from .cli_commands.common import (
21
+ check_agent_environment,
22
+ check_environment,
23
+ setup_logging,
24
+ )
25
+ from .cli_commands.deploy import deploy_command
26
+ from .cli_commands.deploy_mcp import deploy_mcp_command
27
+ from .cli_commands.preview import preview_command
28
+ from .cli_commands.run_eval_cmd import hydra_cli_entry_point
29
+
30
+
31
def _add_huggingface_options(subparser):
    """Attach the HuggingFace dataset option group shared by preview and deploy."""
    hf_group = subparser.add_argument_group("HuggingFace Dataset Options")
    hf_group.add_argument(
        "--huggingface-dataset",
        "--hf",
        help="HuggingFace dataset name (e.g., 'deepseek-ai/DeepSeek-ProverBench')",
    )
    hf_group.add_argument(
        "--huggingface-split",
        default="train",
        help="Dataset split to use (default: 'train')",
    )
    hf_group.add_argument(
        "--huggingface-prompt-key",
        default="prompt",
        help="Key in the dataset containing the prompt text (default: 'prompt')",
    )
    hf_group.add_argument(
        "--huggingface-response-key",
        default="response",
        help="Key in the dataset containing the response text (default: 'response')",
    )
    hf_group.add_argument(
        "--huggingface-key-map",
        help="JSON mapping of dataset keys to reward-kit message keys",
    )


def _add_preview_parser(subparsers):
    """Register the 'preview' sub-command and its options."""
    preview_parser = subparsers.add_parser("preview", help="Preview an evaluator with sample data")
    preview_parser.add_argument(
        "--metrics-folders",
        "-m",
        nargs="+",
        help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'",
    )

    # Samples are optional so that a HuggingFace dataset can be used instead.
    preview_parser.add_argument(
        "--samples",
        "-s",
        required=False,
        help="Path to JSONL file containing sample data",
    )
    preview_parser.add_argument(
        "--max-samples",
        type=int,
        default=5,
        help="Maximum number of samples to process (default: 5)",
    )

    _add_huggingface_options(preview_parser)
    preview_parser.add_argument(
        "--remote-url",
        help="URL of a remote reward function endpoint to preview against. If provided, metrics-folders might be ignored.",
    )


def _add_deploy_parser(subparsers):
    """Register the 'deploy' sub-command and its option groups."""
    deploy_parser = subparsers.add_parser("deploy", help="Create and deploy an evaluator, or register a remote one")
    deploy_parser.add_argument("--id", required=True, help="ID for the evaluator")
    deploy_parser.add_argument(
        "--metrics-folders",
        "-m",
        nargs="+",
        required=False,  # No longer strictly required if --remote-url is used
        help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'. Required if not using --remote-url.",
    )
    deploy_parser.add_argument(
        "--display-name",
        help="Display name for the evaluator (defaults to ID if not provided)",
    )
    deploy_parser.add_argument("--description", help="Description for the evaluator")
    deploy_parser.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Force update if evaluator already exists",
    )

    _add_huggingface_options(deploy_parser)
    deploy_parser.add_argument(
        "--remote-url",
        help="URL of a pre-deployed remote reward function. If provided, deploys by registering this URL with Fireworks AI.",
    )

    # Deployment target options
    target_group = deploy_parser.add_argument_group("Deployment Target Options")
    target_group.add_argument(
        "--target",
        choices=["fireworks", "gcp-cloud-run", "local-serve"],
        default="fireworks",
        help="Deployment target. 'fireworks' for standard Fireworks platform deployment, 'gcp-cloud-run' for Google Cloud Run, 'local-serve' for local serving with Serveo tunneling.",
    )
    target_group.add_argument(
        "--function-ref",
        help="Reference to the reward function to deploy (e.g., 'my_module.reward_func'). Required for 'gcp-cloud-run' and 'local-serve' targets.",
    )

    # Local serving options (relevant if --target is local-serve)
    local_serve_group = deploy_parser.add_argument_group("Local Serving Options (used if --target is local-serve)")
    local_serve_group.add_argument(
        "--local-port",
        type=int,
        default=8001,
        help="Port for the local reward function server to listen on (default: 8001). Used with --target local-serve.",
    )

    # GCP deployment options (--function-ref lives in the target group above)
    gcp_group = deploy_parser.add_argument_group(
        "GCP Cloud Run Deployment Options (used if --target is gcp-cloud-run)"
    )
    gcp_group.add_argument(
        "--gcp-project",
        required=False,
        help="Google Cloud Project ID. Must be provided via CLI or rewardkit.yaml.",
    )
    gcp_group.add_argument(
        "--gcp-region",
        required=False,
        help="Google Cloud Region for deployment (e.g., 'us-central1'). Must be provided via CLI or rewardkit.yaml.",
    )
    gcp_group.add_argument(
        "--gcp-ar-repo",
        required=False,
        help="Google Artifact Registry repository name. Optional, defaults to value in rewardkit.yaml or 'reward-kit-evaluators' if not specified.",
    )
    gcp_group.add_argument(
        "--service-account",
        help="Email of the GCP service account to run the Cloud Run service. Optional.",
    )
    gcp_group.add_argument(
        "--entry-point",
        default="reward_function",
        help="The name of the entry point function within your --function-ref module (default: reward_function). Only for gcp-cloud-run.",
    )
    gcp_group.add_argument(
        "--runtime",
        default="python311",  # Or a sensible default
        help="The Cloud Functions/Run runtime (e.g., python311). Only for gcp-cloud-run.",
    )
    gcp_group.add_argument(
        "--gcp-auth-mode",
        choices=["open", "api-key"],  # Add 'iam' later
        default=None,  # Default will be resolved in deploy_command
        help="Authentication mode for the deployed GCP Cloud Run service. "
        "'open': Publicly accessible. "
        "'api-key': Service is publicly accessible but requires an API key in requests (handled by the application). "
        "If not specified, defaults to value in rewardkit.yaml or 'api-key'. Optional.",
    )


def _add_deploy_mcp_parser(subparsers):
    """Register the 'deploy-mcp' sub-command and its options."""
    deploy_mcp_parser = subparsers.add_parser("deploy-mcp", help="Deploy an MCP server to Google Cloud Run")
    deploy_mcp_parser.add_argument("--id", required=True, help="Unique ID for the MCP server deployment")
    deploy_mcp_parser.add_argument(
        "--mcp-server-module",
        help="Python module containing the MCP server (e.g., 'examples.frozen_lake_mcp.frozen_lake_mcp_server'). Required if --dockerfile is not provided.",
    )
    deploy_mcp_parser.add_argument(
        "--dockerfile",
        help="Path to Dockerfile to use for deployment (recommended for tested local Dockerfiles). When provided, --mcp-server-module is not required.",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-project",
        help="Google Cloud Project ID. Can also be set in rewardkit.yaml",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-region",
        help="Google Cloud Region (e.g., 'us-central1'). Can also be set in rewardkit.yaml",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-ar-repo",
        help="Google Artifact Registry repository name. Defaults to 'reward-kit-mcp-servers'",
    )
    deploy_mcp_parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port for the MCP server to listen on (default: 8000)",
    )
    deploy_mcp_parser.add_argument(
        "--python-version",
        default="3.11",
        help="Python version for the container (default: 3.11)",
    )
    deploy_mcp_parser.add_argument("--requirements", help="Additional pip requirements (newline separated)")
    deploy_mcp_parser.add_argument("--env-vars", nargs="*", help="Environment variables in KEY=VALUE format")


def _add_agent_eval_parser(subparsers):
    """Register the 'agent-eval' sub-command and its options."""
    agent_eval_parser = subparsers.add_parser(
        "agent-eval", help="Run agent evaluation using the ForkableResource framework."
    )
    agent_eval_parser.add_argument(
        "--task-def",
        required=True,
        help="Path to task definition file or directory containing task definitions.",
    )
    agent_eval_parser.add_argument(
        "--parallel",
        action="store_true",
        help="Execute tasks in parallel when multiple tasks are specified.",
    )
    agent_eval_parser.add_argument(
        "--max-concurrency",
        type=int,
        default=3,
        help="Maximum number of tasks to execute in parallel (default: 3).",
    )
    agent_eval_parser.add_argument(
        "--filter",
        nargs="+",
        help="Run only tasks matching the specified task IDs.",
    )
    agent_eval_parser.add_argument(
        "--output-dir",
        default="./agent_runs",
        help="Directory to store agent evaluation run results (default: ./agent_runs).",
    )
    agent_eval_parser.add_argument(
        "--model",
        help="Override MODEL_AGENT environment variable (format: provider/model_name).",
    )
    agent_eval_parser.add_argument(
        "--num-rollouts",
        type=int,
        help="Override the number of parallel rollouts to execute for each task.",
    )


def parse_args(args=None):
    """
    Parse command line arguments.

    Builds the top-level parser with the 'preview', 'deploy', 'deploy-mcp',
    'agent-eval' and 'run' sub-commands and parses with ``parse_known_args``
    so that arguments the parser does not recognise are handed back instead
    of causing an error.

    Args:
        args: Argument list to parse; None lets argparse read ``sys.argv``.

    Returns:
        A ``(namespace, remaining)`` tuple: the parsed namespace (its
        ``command`` attribute is the chosen sub-command, or None) and the
        list of unrecognised arguments.
    """
    parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    _add_preview_parser(subparsers)
    _add_deploy_parser(subparsers)
    _add_deploy_mcp_parser(subparsers)
    _add_agent_eval_parser(subparsers)

    # Run command (for Hydra-based evaluations)
    # This subparser intentionally defines no arguments itself.
    # All arguments after 'run' will be passed to Hydra by parse_known_args.
    subparsers.add_parser(
        "run",
        help="Run an evaluation using a Hydra configuration. All arguments after 'run' are passed to Hydra.",
    )

    # Use parse_known_args to allow Hydra to handle its own arguments
    return parser.parse_known_args(args)
298
+
299
+
300
# Spellings Hydra accepts for overriding the config search path.
_HYDRA_CONFIG_PATH_FLAGS = ("--config-path", "-cp")


def _ensure_cwd_importable():
    """Put the current directory on PYTHONPATH and sys.path if it is missing."""
    current_dir = os.getcwd()
    current_pythonpath = os.environ.get("PYTHONPATH", "")
    if current_dir not in current_pythonpath.split(os.pathsep):
        if current_pythonpath:
            os.environ["PYTHONPATH"] = f"{current_dir}{os.pathsep}{current_pythonpath}"
        else:
            os.environ["PYTHONPATH"] = current_dir
        logger.debug(f"Added current directory to PYTHONPATH: {current_dir}")

    # Also add to sys.path so it takes effect immediately for the current process
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)


def _is_config_path_arg(arg):
    """Tell whether *arg* is a Hydra config-path flag, long or short form."""
    return arg.startswith("--config-path") or arg == "-cp" or arg.startswith("-cp=")


def _absolutize_config_path_args(hydra_args):
    """Return *hydra_args* with every config-path value made absolute.

    Both ``FLAG VALUE`` and ``FLAG=VALUE`` spellings are emitted as two
    separate arguments (flag, absolute path). Every other argument is passed
    through untouched.
    """
    processed = []
    arg_iter = iter(hydra_args)
    for arg in arg_iter:
        flag, separator, inline_value = arg.partition("=")
        if flag not in _HYDRA_CONFIG_PATH_FLAGS:
            processed.append(arg)
            continue

        processed.append(flag)
        if separator:
            path_val, spelling = inline_value, "equals separated"
        else:
            path_val, spelling = next(arg_iter, None), "space separated"
            if path_val is None:
                # Keep the dangling flag so Hydra reports the usage error.
                logger.error(f"{flag} specified without a value.")
                continue

        abs_path = os.path.abspath(path_val)
        logger.debug(f"Converting relative {flag} '{path_val}' ({spelling}) to absolute '{abs_path}'")
        processed.append(abs_path)
    return processed


def _log_hydra_failure_hints(error_msg):
    """Log troubleshooting hints matching common Hydra/config error messages."""
    if "Cannot find primary config" in error_msg:
        logger.error("HINT: Configuration file not found.")
        logger.error("SOLUTION: Ensure you have a config file in ./conf/ directory")
        logger.error("Try: reward-kit run --config-name simple_uipath_eval")
    elif "missing from config" in error_msg or "MissingMandatoryValue" in error_msg:
        logger.error("HINT: Required configuration values are missing.")
        logger.error("SOLUTION: Check your config file for missing required fields")
    elif "Config search path" in error_msg:
        logger.error("HINT: Hydra cannot find the configuration directory.")
        logger.error("SOLUTION: Create a ./conf directory with your config files")
    elif "ValidationError" in error_msg:
        logger.error("HINT: Configuration validation failed.")
        logger.error("SOLUTION: Run 'reward-kit validate-data --file your_data.jsonl' to check data")

    logger.error("\nQuick fix suggestions:")
    logger.error("1. Use the simplified setup: reward-kit run --config-name simple_uipath_eval")
    logger.error("2. Validate your data first: reward-kit validate-data --file data.jsonl --schema agent")
    logger.error("3. Ensure you have: ./conf/simple_uipath_eval.yaml and ./uipath_reward.py")


def _run_hydra_command(remaining_argv):
    """Rewrite sys.argv for Hydra, run the Hydra entry point, return an exit code."""
    # Drop a literal '--' separator that may be left in the unparsed arguments.
    hydra_specific_args = [arg for arg in remaining_argv if arg != "--"]

    # Auto-detect a local conf directory unless the user already chose a
    # config path. The short '-cp' spelling counts too; otherwise ./conf would
    # be injected next to the user's own choice.
    has_config_path = any(_is_config_path_arg(arg) for arg in hydra_specific_args)
    local_conf_dir = os.path.join(os.getcwd(), "conf")
    if not has_config_path and os.path.isdir(local_conf_dir):
        logger.info(f"Auto-detected local conf directory: {local_conf_dir}")
        hydra_specific_args = ["--config-path", local_conf_dir] + hydra_specific_args

    sys.argv = [sys.argv[0]] + _absolutize_config_path_args(hydra_specific_args)
    logger.info(f"SYSCALL_ARGV_FOR_HYDRA (after potential abspath conversion): {sys.argv}")

    try:
        hydra_cli_entry_point()
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        logger.error(f"Evaluation failed: {e}")
        logger.debug("Traceback for the failed evaluation:", exc_info=True)
        _log_hydra_failure_hints(str(e))
        return 1
    return 0


def main():
    """
    Main entry point for the CLI.

    Loads optional dotenv files, makes the current directory importable,
    parses the command line, configures logging and dispatches to the
    selected sub-command.

    Returns:
        The sub-command's return value for 'preview', 'deploy', 'deploy-mcp'
        and 'agent-eval'; 0 or 1 for 'run'; 1 when no command was given
        (after printing the CLI help).
    """
    try:
        from dotenv import load_dotenv

        # .env.dev for development-specific overrides, .env for general
        # NOTE(review): both calls use override=True and .env is loaded last,
        # so values in .env appear to win over .env.dev - confirm this
        # precedence is intended.
        load_dotenv(dotenv_path=Path(".") / ".env.dev", override=True)
        load_dotenv(override=True)
    except ImportError:
        # python-dotenv is optional.
        pass

    # This needs to happen early, before any module loading occurs
    _ensure_cwd_importable()

    args, remaining_argv = parse_args()  # parse_known_args under the hood

    setup_logging(args.verbose, getattr(args, "debug", False))

    handlers = {
        "preview": preview_command,
        "deploy": deploy_command,
        "deploy-mcp": deploy_mcp_command,
        "agent-eval": agent_eval_command,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)

    if args.command == "run":
        # For the 'run' command, Hydra takes over argument parsing.
        return _run_hydra_command(remaining_argv)

    # No command given: show the real CLI help, including the sub-commands.
    # argparse exits after printing help, so the SystemExit is absorbed to
    # keep the non-zero return code.
    try:
        parse_args(["--help"])
    except SystemExit:
        pass
    return 1
422
+
423
+
424
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1 @@
1
+ # This package will contain modules for different CLI commands and common utilities.