eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tool Registry for the Agent Evaluation Framework.
|
|
3
|
+
Provides a mechanism to register and manage tools.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import functools
|
|
7
|
+
import inspect
|
|
8
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ToolRegistry:
    """
    Named collection of callables that agents can invoke as tools.

    Attributes:
        name: Identifier given to this registry at construction time
        tools: Maps each tool name to the registered (undecorated) function
        descriptions: Maps each tool name to its description text
        parameters: Maps each tool name to its parameter specification
    """

    def __init__(self, name: str):
        """
        Create an empty registry.

        Args:
            name: Identifier for this registry
        """
        self.name = name
        self.tools: Dict[str, Callable] = {}
        self.descriptions: Dict[str, str] = {}
        self.parameters: Dict[str, Dict[str, Any]] = {}

    def tool(self, description: str, parameters: Dict[str, Any]) -> Callable:
        """
        Build a decorator that records the decorated function as a tool.

        The function is stored under its ``__name__``; registering another
        function with the same name overwrites the earlier entry.

        Args:
            description: Human-readable description of the tool
            parameters: Parameter specifications for the tool

        Returns:
            A decorator. Applying it registers the function and returns a
            thin pass-through wrapper around it.
        """

        def decorator(func: Callable) -> Callable:
            key = func.__name__
            # The three tables are always written together under the same key.
            for table, entry in (
                (self.tools, func),
                (self.descriptions, description),
                (self.parameters, parameters),
            ):
                table[key] = entry

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                return func(*args, **kwargs)

            return wrapper

        return decorator

    def get_tool(self, tool_name: str) -> Optional[Callable]:
        """
        Look up a single tool.

        Args:
            tool_name: Name the tool was registered under

        Returns:
            The registered function, or None when the name is unknown
        """
        try:
            return self.tools[tool_name]
        except KeyError:
            return None

    def get_tools(self) -> Dict[str, Callable]:
        """
        Return the registry's tool table.

        Returns:
            The dictionary mapping tool names to tool functions (the
            registry's own dictionary, not a copy)
        """
        return self.tools

    def get_openai_tools(self) -> List[Dict[str, Any]]:
        """
        Describe every registered tool as a function-calling specification.

        Every key of a tool's parameter specification is listed as required.

        Returns:
            One ``{"name", "description", "parameters"}`` dictionary per
            tool, in registration order
        """

        def describe(tool_name: str) -> Dict[str, Any]:
            properties = self.parameters.get(tool_name, {})
            return {
                "name": tool_name,
                "description": self.descriptions.get(tool_name, ""),
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": list(properties),
                },
            }

        return [describe(tool_name) for tool_name in self.tools]

    def create_fastapi_app(self):
        """
        Stand-in for building a FastAPI app with one endpoint per tool.

        Returns:
            A plain dictionary naming the app type and the registered tool
            names; no FastAPI object is created by this stub
        """
        # This is a stub implementation
        registered_names = list(self.tools.keys())
        return {"app_type": "FastAPI", "tools": registered_names}
|
eval_protocol/auth.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import configparser
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Optional # Added Dict
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
FIREWORKS_CONFIG_DIR = Path.home() / ".fireworks"
|
|
10
|
+
AUTH_INI_FILE = FIREWORKS_CONFIG_DIR / "auth.ini"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _parse_simple_auth_file(file_path: Path) -> Dict[str, str]:
|
|
14
|
+
"""
|
|
15
|
+
Parses an auth file with simple key=value lines.
|
|
16
|
+
Handles comments starting with # or ;.
|
|
17
|
+
Strips whitespace and basic quotes from values.
|
|
18
|
+
"""
|
|
19
|
+
creds = {}
|
|
20
|
+
if not file_path.exists():
|
|
21
|
+
return creds
|
|
22
|
+
try:
|
|
23
|
+
with open(file_path, "r") as f:
|
|
24
|
+
for line in f:
|
|
25
|
+
line = line.strip()
|
|
26
|
+
if not line or line.startswith("#") or line.startswith(";"):
|
|
27
|
+
continue
|
|
28
|
+
if "=" in line:
|
|
29
|
+
key, value = line.split("=", 1)
|
|
30
|
+
key = key.strip()
|
|
31
|
+
value = value.strip()
|
|
32
|
+
# Remove surrounding quotes if present
|
|
33
|
+
if value and (
|
|
34
|
+
(value.startswith('"') and value.endswith('"'))
|
|
35
|
+
or (value.startswith("'") and value.endswith("'"))
|
|
36
|
+
):
|
|
37
|
+
value = value[1:-1]
|
|
38
|
+
|
|
39
|
+
if key in ["api_key", "account_id"] and value:
|
|
40
|
+
creds[key] = value
|
|
41
|
+
except Exception as e:
|
|
42
|
+
logger.warning(f"Error during simple parsing of {file_path}: {e}")
|
|
43
|
+
return creds
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _get_credential_from_config_file(key_name: str) -> Optional[str]:
    """
    Look up a single credential (api_key or account_id) in auth.ini.

    Sources are tried in this order, and the first non-empty value wins:
      1. Simple ``key=value`` parsing of the file.
      2. The ``[fireworks]`` section, read with configparser.
      3. The configparser default section.

    Args:
        key_name: Name of the credential to look up.

    Returns:
        The credential value, or None when the file is missing, the key is
        absent or empty, or the file cannot be parsed (parse problems are
        logged, not raised).
    """
    if not AUTH_INI_FILE.exists():
        return None

    # 1. Try simple key-value parsing first
    simple_creds = _parse_simple_auth_file(AUTH_INI_FILE)
    if key_name in simple_creds:
        logger.debug(f"Using {key_name} from simple key-value parsing of {AUTH_INI_FILE}.")
        return simple_creds[key_name]

    # 2. Fallback to configparser if not found via simple parsing or if simple parsing failed
    # This path will also generate the "no section headers" warning if applicable,
    # but only if simple parsing didn't yield the key.
    try:
        # Credentials are opaque strings: disable interpolation so that a
        # value containing "%" is returned verbatim instead of raising an
        # interpolation error that would make the credential look missing.
        config = configparser.ConfigParser(interpolation=None)
        config.read(AUTH_INI_FILE)

        # Try [fireworks] section (has_option is False when the section is absent)
        if config.has_option("fireworks", key_name):
            value_from_file = config.get("fireworks", key_name)
            if value_from_file:
                logger.debug(f"Using {key_name} from [fireworks] section in {AUTH_INI_FILE}.")
                return value_from_file

        # Try default section (configparser might place items without section header here)
        if config.has_option(config.default_section, key_name):
            value_from_default = config.get(config.default_section, key_name)
            if value_from_default:
                logger.debug(f"Using {key_name} from default section [{config.default_section}] in {AUTH_INI_FILE}.")
                return value_from_default

    except configparser.MissingSectionHeaderError:
        # This error implies the file is purely key-value, which simple parsing should have handled.
        # If simple parsing failed to get the key, then it's likely not there or malformed.
        logger.debug(f"{AUTH_INI_FILE} has no section headers, and simple parsing did not find {key_name}.")
    except configparser.Error as e_config:
        logger.warning(f"Configparser error reading {AUTH_INI_FILE} for {key_name}: {e_config}")
    except Exception as e_general:
        # Deliberately broad: credential lookup is best-effort and must not crash callers.
        logger.warning(f"Unexpected error reading {AUTH_INI_FILE} for {key_name}: {e_general}")

    return None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_fireworks_api_key() -> Optional[str]:
    """
    Resolve the Fireworks API key.

    Lookup order:
      1. The FIREWORKS_API_KEY environment variable (ignored when empty).
      2. The 'api_key' entry of ~/.fireworks/auth.ini, as resolved by
         _get_credential_from_config_file.

    Returns:
        The API key, or None when neither source provides a non-empty value.
    """
    env_key = os.environ.get("FIREWORKS_API_KEY")
    if env_key:
        logger.debug("Using FIREWORKS_API_KEY from environment variable.")
        return env_key

    file_key = _get_credential_from_config_file("api_key")
    if not file_key:
        logger.debug("Fireworks API key not found in environment variables or auth.ini.")
        return None
    return file_key
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_fireworks_account_id() -> Optional[str]:
    """
    Resolve the Fireworks Account ID.

    Lookup order:
      1. The FIREWORKS_ACCOUNT_ID environment variable (ignored when empty).
      2. The 'account_id' entry of ~/.fireworks/auth.ini, as resolved by
         _get_credential_from_config_file.

    Returns:
        The Account ID, or None when neither source provides a non-empty value.
    """
    env_account = os.environ.get("FIREWORKS_ACCOUNT_ID")
    if env_account:
        logger.debug("Using FIREWORKS_ACCOUNT_ID from environment variable.")
        return env_account

    file_account = _get_credential_from_config_file("account_id")
    if not file_account:
        logger.debug("Fireworks Account ID not found in environment variables or auth.ini.")
        return None
    return file_account
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_fireworks_api_base() -> str:
    """
    Resolve the Fireworks API base URL.

    The value comes from the FIREWORKS_API_BASE environment variable. When the
    variable is unset or empty, "https://api.fireworks.ai" is used.

    Returns:
        The API base URL; never an empty string.
    """
    default_api_base = "https://api.fireworks.ai"

    # Read the environment once and treat an empty value like an unset one,
    # matching how the API key and account ID lookups handle empty variables.
    # Previously an empty variable produced an empty base URL while the log
    # message claimed the default was being used.
    api_base = os.environ.get("FIREWORKS_API_BASE")
    if api_base:
        logger.debug("Using FIREWORKS_API_BASE from environment variable.")
        return api_base

    logger.debug(f"FIREWORKS_API_BASE not set in environment, defaulting to {default_api_base}.")
    return default_api_base
|
eval_protocol/cli.py
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for eval-protocol.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import asyncio
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import sys
|
|
11
|
+
import traceback
|
|
12
|
+
import uuid
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
from eval_protocol.evaluation import create_evaluation, preview_evaluation
|
|
18
|
+
|
|
19
|
+
from .cli_commands.agent_eval_cmd import agent_eval_command
|
|
20
|
+
from .cli_commands.common import (
|
|
21
|
+
check_agent_environment,
|
|
22
|
+
check_environment,
|
|
23
|
+
setup_logging,
|
|
24
|
+
)
|
|
25
|
+
from .cli_commands.deploy import deploy_command
|
|
26
|
+
from .cli_commands.deploy_mcp import deploy_mcp_command
|
|
27
|
+
from .cli_commands.preview import preview_command
|
|
28
|
+
from .cli_commands.run_eval_cmd import hydra_cli_entry_point
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_args(args=None):
    """
    Parse command line arguments.

    Builds the top-level parser with the ``preview``, ``deploy``,
    ``deploy-mcp``, ``agent-eval`` and ``run`` subcommands.

    Args:
        args: Argument strings to parse; None makes argparse read sys.argv.

    Returns:
        A ``(namespace, remaining)`` tuple from ``parse_known_args``:
        the parsed options and the list of arguments the parsers did not
        recognise (for ``run`` these are intended for Hydra).
    """
    parser = argparse.ArgumentParser(description="eval-protocol: Tools for evaluation and reward modeling")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    _add_preview_parser(subparsers)
    _add_deploy_parser(subparsers)
    _add_deploy_mcp_parser(subparsers)
    _add_agent_eval_parser(subparsers)

    # Run command (for Hydra-based evaluations)
    # This subparser intentionally defines no arguments itself.
    # All arguments after 'run' will be passed to Hydra by parse_known_args.
    subparsers.add_parser(
        "run",
        help="Run an evaluation using a Hydra configuration. All arguments after 'run' are passed to Hydra.",
    )

    # Use parse_known_args to allow Hydra to handle its own arguments
    return parser.parse_known_args(args)


def _add_huggingface_options(parser):
    """Add the "HuggingFace Dataset Options" group shared by preview and deploy."""
    hf_group = parser.add_argument_group("HuggingFace Dataset Options")
    hf_group.add_argument(
        "--huggingface-dataset",
        "--hf",
        help="HuggingFace dataset name (e.g., 'deepseek-ai/DeepSeek-ProverBench')",
    )
    hf_group.add_argument(
        "--huggingface-split",
        default="train",
        help="Dataset split to use (default: 'train')",
    )
    hf_group.add_argument(
        "--huggingface-prompt-key",
        default="prompt",
        help="Key in the dataset containing the prompt text (default: 'prompt')",
    )
    hf_group.add_argument(
        "--huggingface-response-key",
        default="response",
        help="Key in the dataset containing the response text (default: 'response')",
    )
    hf_group.add_argument(
        "--huggingface-key-map",
        help="JSON mapping of dataset keys to reward-kit message keys",
    )


def _add_preview_parser(subparsers):
    """Register the ``preview`` subcommand and its options."""
    preview_parser = subparsers.add_parser("preview", help="Preview an evaluator with sample data")
    preview_parser.add_argument(
        "--metrics-folders",
        "-m",
        nargs="+",
        help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'",
    )

    # Make samples optional to allow HF dataset option
    preview_parser.add_argument(
        "--samples",
        "-s",
        required=False,
        help="Path to JSONL file containing sample data",
    )
    preview_parser.add_argument(
        "--max-samples",
        type=int,
        default=5,
        help="Maximum number of samples to process (default: 5)",
    )

    _add_huggingface_options(preview_parser)

    preview_parser.add_argument(
        "--remote-url",
        help="URL of a remote reward function endpoint to preview against. If provided, metrics-folders might be ignored.",
    )


def _add_deploy_parser(subparsers):
    """Register the ``deploy`` subcommand and its option groups."""
    deploy_parser = subparsers.add_parser("deploy", help="Create and deploy an evaluator, or register a remote one")
    deploy_parser.add_argument("--id", required=True, help="ID for the evaluator")
    deploy_parser.add_argument(
        "--metrics-folders",
        "-m",
        nargs="+",
        required=False,  # No longer strictly required if --remote-url is used
        help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'. Required if not using --remote-url.",
    )
    deploy_parser.add_argument(
        "--display-name",
        help="Display name for the evaluator (defaults to ID if not provided)",
    )
    deploy_parser.add_argument("--description", help="Description for the evaluator")
    deploy_parser.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Force update if evaluator already exists",
    )

    _add_huggingface_options(deploy_parser)

    deploy_parser.add_argument(
        "--remote-url",
        help="URL of a pre-deployed remote reward function. If provided, deploys by registering this URL with Fireworks AI.",
    )

    # Deployment target options
    target_group = deploy_parser.add_argument_group("Deployment Target Options")
    target_group.add_argument(
        "--target",
        choices=["fireworks", "gcp-cloud-run", "local-serve"],
        default="fireworks",
        help="Deployment target. 'fireworks' for standard Fireworks platform deployment, 'gcp-cloud-run' for Google Cloud Run, 'local-serve' for local serving with Serveo tunneling.",
    )
    target_group.add_argument(
        "--function-ref",
        help="Reference to the reward function to deploy (e.g., 'my_module.reward_func'). Required for 'gcp-cloud-run' and 'local-serve' targets.",
    )

    # Local serving options (relevant if --target is local-serve)
    local_serve_group = deploy_parser.add_argument_group("Local Serving Options (used if --target is local-serve)")
    local_serve_group.add_argument(
        "--local-port",
        type=int,
        default=8001,
        help="Port for the local reward function server to listen on (default: 8001). Used with --target local-serve.",
    )

    # GCP deployment options (--function-ref lives in the target group above)
    gcp_group = deploy_parser.add_argument_group(
        "GCP Cloud Run Deployment Options (used if --target is gcp-cloud-run)"
    )
    gcp_group.add_argument(
        "--gcp-project",
        required=False,
        help="Google Cloud Project ID. Must be provided via CLI or rewardkit.yaml.",
    )
    gcp_group.add_argument(
        "--gcp-region",
        required=False,
        help="Google Cloud Region for deployment (e.g., 'us-central1'). Must be provided via CLI or rewardkit.yaml.",
    )
    gcp_group.add_argument(
        "--gcp-ar-repo",
        required=False,
        help="Google Artifact Registry repository name. Optional, defaults to value in rewardkit.yaml or 'reward-kit-evaluators' if not specified.",
    )
    gcp_group.add_argument(
        "--service-account",
        help="Email of the GCP service account to run the Cloud Run service. Optional.",
    )
    gcp_group.add_argument(
        "--entry-point",
        default="reward_function",
        help="The name of the entry point function within your --function-ref module (default: reward_function). Only for gcp-cloud-run.",
    )
    gcp_group.add_argument(
        "--runtime",
        default="python311",  # Or a sensible default
        help="The Cloud Functions/Run runtime (e.g., python311). Only for gcp-cloud-run.",
    )
    gcp_group.add_argument(
        "--gcp-auth-mode",
        choices=["open", "api-key"],  # Add 'iam' later
        default=None,  # Default will be resolved in deploy_command
        help="Authentication mode for the deployed GCP Cloud Run service. "
        "'open': Publicly accessible. "
        "'api-key': Service is publicly accessible but requires an API key in requests (handled by the application). "
        "If not specified, defaults to value in rewardkit.yaml or 'api-key'. Optional.",
    )


def _add_deploy_mcp_parser(subparsers):
    """Register the ``deploy-mcp`` subcommand and its options."""
    deploy_mcp_parser = subparsers.add_parser("deploy-mcp", help="Deploy an MCP server to Google Cloud Run")
    deploy_mcp_parser.add_argument("--id", required=True, help="Unique ID for the MCP server deployment")
    deploy_mcp_parser.add_argument(
        "--mcp-server-module",
        help="Python module containing the MCP server (e.g., 'examples.frozen_lake_mcp.frozen_lake_mcp_server'). Required if --dockerfile is not provided.",
    )
    deploy_mcp_parser.add_argument(
        "--dockerfile",
        help="Path to Dockerfile to use for deployment (recommended for tested local Dockerfiles). When provided, --mcp-server-module is not required.",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-project",
        help="Google Cloud Project ID. Can also be set in rewardkit.yaml",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-region",
        help="Google Cloud Region (e.g., 'us-central1'). Can also be set in rewardkit.yaml",
    )
    deploy_mcp_parser.add_argument(
        "--gcp-ar-repo",
        help="Google Artifact Registry repository name. Defaults to 'reward-kit-mcp-servers'",
    )
    deploy_mcp_parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port for the MCP server to listen on (default: 8000)",
    )
    deploy_mcp_parser.add_argument(
        "--python-version",
        default="3.11",
        help="Python version for the container (default: 3.11)",
    )
    deploy_mcp_parser.add_argument("--requirements", help="Additional pip requirements (newline separated)")
    deploy_mcp_parser.add_argument("--env-vars", nargs="*", help="Environment variables in KEY=VALUE format")


def _add_agent_eval_parser(subparsers):
    """Register the ``agent-eval`` subcommand and its options."""
    agent_eval_parser = subparsers.add_parser(
        "agent-eval", help="Run agent evaluation using the ForkableResource framework."
    )
    agent_eval_parser.add_argument(
        "--task-def",
        required=True,
        help="Path to task definition file or directory containing task definitions.",
    )
    agent_eval_parser.add_argument(
        "--parallel",
        action="store_true",
        help="Execute tasks in parallel when multiple tasks are specified.",
    )
    agent_eval_parser.add_argument(
        "--max-concurrency",
        type=int,
        default=3,
        help="Maximum number of tasks to execute in parallel (default: 3).",
    )
    agent_eval_parser.add_argument(
        "--filter",
        nargs="+",
        help="Run only tasks matching the specified task IDs.",
    )
    agent_eval_parser.add_argument(
        "--output-dir",
        default="./agent_runs",
        help="Directory to store agent evaluation run results (default: ./agent_runs).",
    )
    agent_eval_parser.add_argument(
        "--model",
        help="Override MODEL_AGENT environment variable (format: provider/model_name).",
    )
    agent_eval_parser.add_argument(
        "--num-rollouts",
        type=int,
        help="Override the number of parallel rollouts to execute for each task.",
    )
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
# Command-line spellings Hydra accepts for overriding its config directory.
_CONFIG_PATH_FLAGS = ("--config-path", "-cp")


def _is_config_path_arg(arg):
    """Return True if *arg* is a Hydra config-path flag (long or short form)."""
    # The prefix match on the long form is kept from the original check; the
    # short alias is matched exactly (bare or with "=value") so that unrelated
    # arguments starting with "-cp" are not mistaken for it.
    return arg.startswith("--config-path") or arg.split("=", 1)[0] == "-cp"


def _extend_pythonpath_with_cwd():
    """Put the current working directory on PYTHONPATH and on sys.path."""
    current_dir = os.getcwd()
    current_pythonpath = os.environ.get("PYTHONPATH", "")
    if current_dir not in current_pythonpath.split(os.pathsep):
        if current_pythonpath:
            os.environ["PYTHONPATH"] = f"{current_dir}{os.pathsep}{current_pythonpath}"
        else:
            os.environ["PYTHONPATH"] = current_dir
        logger.debug(f"Added current directory to PYTHONPATH: {current_dir}")

    # Also add to sys.path so it takes effect immediately for the current process.
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)


def _absolutize_config_path_args(hydra_args):
    """Rewrite every config-path value in *hydra_args* as an absolute path.

    Both ``<flag> value`` and ``<flag>=value`` spellings are handled, for the
    long ``--config-path`` flag and its ``-cp`` alias. The ``=`` form is emitted
    as two separate arguments (flag, absolute path).

    Returns:
        The processed argument list, or ``None`` when a config-path flag is the
        last argument and therefore has no value (an error is logged).
    """
    processed = []
    pending = iter(hydra_args)
    for arg in pending:
        flag, equals, path_val = arg.partition("=")
        if arg in _CONFIG_PATH_FLAGS:
            # Space-separated form: the value is the next argument.
            path_val = next(pending, None)
            if path_val is None:
                logger.error(f"{arg} specified without a value.")
                return None
            style = "space separated"
        elif equals and flag in _CONFIG_PATH_FLAGS:
            style = "equals separated"
        else:
            processed.append(arg)
            continue

        abs_path = os.path.abspath(path_val)
        logger.debug(f"Converting relative --config-path '{path_val}' ({style}) to absolute '{abs_path}'")
        processed.extend((flag, abs_path))
    return processed


def _log_hydra_failure_hints(error_msg):
    """Log troubleshooting hints selected by substrings of a Hydra error message."""
    # NOTE(review): the hints below still name the `reward-kit` executable —
    # confirm that is the CLI name users of this package actually have installed.
    if "Cannot find primary config" in error_msg:
        logger.error("HINT: Configuration file not found.")
        logger.error("SOLUTION: Ensure you have a config file in ./conf/ directory")
        logger.error("Try: reward-kit run --config-name simple_uipath_eval")
    elif "missing from config" in error_msg or "MissingMandatoryValue" in error_msg:
        logger.error("HINT: Required configuration values are missing.")
        logger.error("SOLUTION: Check your config file for missing required fields")
    elif "Config search path" in error_msg:
        logger.error("HINT: Hydra cannot find the configuration directory.")
        logger.error("SOLUTION: Create a ./conf directory with your config files")
    elif "ValidationError" in error_msg:
        logger.error("HINT: Configuration validation failed.")
        logger.error("SOLUTION: Run 'reward-kit validate-data --file your_data.jsonl' to check data")

    logger.error("\nQuick fix suggestions:")
    logger.error("1. Use the simplified setup: reward-kit run --config-name simple_uipath_eval")
    logger.error("2. Validate your data first: reward-kit validate-data --file data.jsonl --schema agent")
    logger.error("3. Ensure you have: ./conf/simple_uipath_eval.yaml and ./uipath_reward.py")


def _run_hydra_command(remaining_argv):
    """Hand the arguments that followed ``run`` over to the Hydra entry point.

    Args:
        remaining_argv: Arguments left unparsed by ``parse_known_args``.

    Returns:
        0 when the Hydra entry point returns normally, 1 when it raises an
        ``Exception`` or when a config-path flag was given without a value.
    """
    # Filter out the initial '--' if present, which parse_known_args might add.
    hydra_args = [arg for arg in remaining_argv if arg != "--"]

    # Auto-detect a local conf directory unless the user chose one explicitly
    # (via either --config-path or its -cp alias).
    local_conf_dir = os.path.join(os.getcwd(), "conf")
    if not any(_is_config_path_arg(arg) for arg in hydra_args) and os.path.isdir(local_conf_dir):
        logger.info(f"Auto-detected local conf directory: {local_conf_dir}")
        hydra_args = ["--config-path", local_conf_dir, *hydra_args]

    processed_hydra_args = _absolutize_config_path_args(hydra_args)
    if processed_hydra_args is None:
        # A dangling config-path flag can never be parsed by Hydra; fail here
        # with the error already logged instead of forwarding a broken argv.
        return 1

    # Hydra reads its arguments from sys.argv, so replace everything after argv[0].
    sys.argv = [sys.argv[0], *processed_hydra_args]
    logger.info(f"SYSCALL_ARGV_FOR_HYDRA (after potential abspath conversion): {sys.argv}")

    try:
        hydra_cli_entry_point()
    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        # Provide helpful suggestions for common Hydra/config errors.
        _log_hydra_failure_hints(str(e))
        return 1
    return 0


def main():
    """Main entry point for the CLI.

    Loads optional dotenv files, makes the current directory importable,
    parses the command line and dispatches to the selected sub-command.

    Returns:
        The selected command handler's return value; for ``run``, 0 on success
        and 1 on failure; 1 (after printing help) for an unrecognised command.
    """
    try:
        from dotenv import load_dotenv

        # .env.dev for development-specific overrides, .env for general
        # NOTE(review): both calls pass override=True, so values from the
        # second (.env) load replace those set from .env.dev — confirm that
        # this precedence is the intended one.
        load_dotenv(dotenv_path=Path(".") / ".env.dev", override=True)
        load_dotenv(override=True)
    except ImportError:
        # python-dotenv is optional; run without .env support when it is absent.
        pass

    # Automatic PYTHONPATH enhancement - this needs to happen early, before
    # any module loading occurs.
    _extend_pythonpath_with_cwd()

    # Store original sys.argv[0] because Hydra might manipulate it
    # and we need it if we're not calling a Hydra app.
    original_script_name = sys.argv[0]
    args, remaining_argv = parse_args()  # Uses parse_known_args

    setup_logging(args.verbose, getattr(args, "debug", False))

    if args.command == "preview":
        return preview_command(args)
    if args.command == "deploy":
        return deploy_command(args)
    if args.command == "deploy-mcp":
        return deploy_mcp_command(args)
    if args.command == "agent-eval":
        return agent_eval_command(args)
    if args.command == "run":
        # For the 'run' command, Hydra takes over argument parsing.
        return _run_hydra_command(remaining_argv)

    # No recognised command: print help from a bare parser that only carries
    # the program name, and report failure.
    temp_parser = argparse.ArgumentParser(prog=os.path.basename(original_script_name))
    temp_parser.print_help()
    return 1
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
if __name__ == "__main__":
    # Executed as a script: use main()'s return value as the process exit status.
    raise SystemExit(main())
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# This package will contain modules for different CLI commands and common utilities.
|