eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL file and return the values parsed from its lines.

    Blank or whitespace-only lines are skipped silently. A line that is not
    valid JSON is logged as an error and skipped; the remaining lines are
    still parsed, so the result may be partial.

    Args:
        file_path: Path to the JSONL file.

    Returns:
        A list with one parsed JSON value per valid line, in file order.
        Returns an empty list if the file is not found or if reading it fails
        unexpectedly (values parsed before the failure are discarded); both
        cases are logged.
    """
    data: List[Dict[str, Any]] = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                stripped = line.strip()
                if not stripped:
                    # An empty line (typically a trailing newline) is not a
                    # record; json.loads("") would raise and log a bogus error.
                    continue
                try:
                    data.append(json.loads(stripped))
                except json.JSONDecodeError as e:
                    # Log and continue, returning the successfully parsed lines.
                    logger.error(
                        f"Error decoding JSON on line {line_number} in {file_path}: {e} - Line: '{stripped}'"
                    )
    except FileNotFoundError:
        logger.error(f"File not found: {file_path}")
        return []
    except Exception as e:
        # Best-effort contract: any other read failure yields an empty result.
        logger.error(f"An unexpected error occurred while reading {file_path}: {e}")
        return []
    return data
|
eval_protocol/config.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Dict, Literal, Optional
|
|
3
|
+
|
|
4
|
+
import yaml
|
|
5
|
+
from pydantic import BaseModel, ValidationError
|
|
6
|
+
|
|
7
|
+
CONFIG_FILE_NAME = "rewardkit.yaml"
|
|
8
|
+
|
|
9
|
+
# --- Pydantic Models for Configuration Structure ---
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GCPCloudRunConfig(BaseModel):
    """Settings for the GCP Cloud Run deployment target; every field is optional."""

    # The next three fields carry no default here: a default will be applied
    # in deploy_command if they are not set.
    project_id: Optional[str] = None
    region: Optional[str] = None
    artifact_registry_repository: Optional[str] = None

    service_name_template: Optional[str] = "rewardeval-{evaluator_id}"

    # Default auth mode if using GCP target and not specified.
    default_auth_mode: Optional[Literal["api-key", "iam", "mtls-client-auth"]] = "api-key"

    # Maps ENV_VAR_NAME to GCP Secret Manager ID.
    secrets: Optional[Dict[str, str]] = {}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AWSLambdaConfig(BaseModel):
    """Settings for the AWS Lambda deployment target; every field is optional."""

    region: Optional[str] = None

    function_name_template: Optional[str] = "rewardeval-{evaluator_id}"

    default_auth_mode: Optional[Literal["api-key", "iam", "mtls-client-auth"]] = "api-key"

    # Maps ENV_VAR_NAME to AWS Secret ARN.
    secrets: Optional[Dict[str, str]] = {}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class RewardKitConfig(BaseModel):
    """Top-level configuration model; holds the per-target sections and endpoint keys."""

    default_deployment_target: Optional[Literal["gcp-cloud-run", "aws-lambda", "fireworks", "local"]] = "fireworks"

    # Per-target sections; each defaults to that section's own defaults.
    gcp_cloud_run: Optional[GCPCloudRunConfig] = GCPCloudRunConfig()
    aws_lambda: Optional[AWSLambdaConfig] = AWSLambdaConfig()

    # Stores generated API keys for self-hosted evaluator endpoints.
    evaluator_endpoint_keys: Optional[Dict[str, str]] = {}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# --- Global variable to hold the loaded configuration ---
|
|
40
|
+
_loaded_config: Optional[RewardKitConfig] = None
|
|
41
|
+
_config_file_path: Optional[str] = None
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def find_config_file(start_path: Optional[str] = None) -> Optional[str]:
    """
    Locate the configuration file by walking up the directory tree.

    Args:
        start_path: Directory to start from; the current working directory is
            used when this is None.

    Returns:
        The path of the first CONFIG_FILE_NAME file found in start_path or one
        of its ancestors, or None when the filesystem root is reached without
        finding one.
    """
    directory = os.path.abspath(os.getcwd() if start_path is None else start_path)
    candidate = os.path.join(directory, CONFIG_FILE_NAME)

    while not os.path.isfile(candidate):
        parent = os.path.dirname(directory)
        if parent == directory:
            # dirname() of the root is the root itself: nothing left to search.
            return None
        directory = parent
        candidate = os.path.join(directory, CONFIG_FILE_NAME)

    return candidate
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def load_config(config_path: Optional[str] = None) -> RewardKitConfig:
    """
    Load the rewardkit.yaml configuration and cache it at module level.

    If a configuration is already cached it is returned as-is, unless a
    config_path different from the cached one is provided. If no path is
    provided, rewardkit.yaml is searched for in the CWD and its parent
    directories.

    This function never raises for a missing, unparsable or invalid file: it
    falls back to a default RewardKitConfig. Parse, validation and unexpected
    errors are reported with print(); a missing file is silent.

    Args:
        config_path: Explicit path to a configuration file, or None to use the
            cache / auto-discovery.

    Returns:
        The loaded (or default) configuration, which is also stored in the
        module-level cache.
    """
    global _loaded_config, _config_file_path

    if not config_path:
        if _loaded_config is not None:
            return _loaded_config
        config_path = find_config_file()
        if not config_path:
            # Nothing to load anywhere up the tree: cache and return defaults.
            _loaded_config = RewardKitConfig()
            _config_file_path = None
            return _loaded_config

    if _loaded_config is not None and config_path == _config_file_path:
        return _loaded_config

    # Path recorded alongside the cached config. It stays set even when the
    # file is broken, so the same broken file is not reloaded on every call.
    cached_path: Optional[str] = config_path
    try:
        # YAML files are UTF-8 by convention; do not depend on the locale encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            raw_config = yaml.safe_load(f)

        # An empty file parses to None; treat it as "all defaults".
        config = RewardKitConfig() if raw_config is None else RewardKitConfig(**raw_config)
    except FileNotFoundError:
        config = RewardKitConfig()
        cached_path = None
    except yaml.YAMLError as e:
        print(f"Error parsing YAML from {config_path}: {e}")
        config = RewardKitConfig()
    except ValidationError as e:
        print(f"Error validating configuration from {config_path}: {e}")
        config = RewardKitConfig()
    except Exception as e:
        # Deliberate best-effort boundary (e.g. non-mapping YAML, decode or
        # permission errors): report and fall back to defaults.
        print(f"An unexpected error occurred while loading configuration from {config_path}: {e}")
        config = RewardKitConfig()

    _loaded_config = config
    _config_file_path = cached_path
    return _loaded_config
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def get_config() -> RewardKitConfig:
    """
    Return the cached configuration, loading it first when nothing is cached yet.
    """
    return load_config() if _loaded_config is None else _loaded_config
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# Example usage (can be removed or kept for testing module directly)
|
|
129
|
+
# Example usage (can be removed or kept for testing module directly)
if __name__ == "__main__":
    # The dummy file is created inside a throwaway directory so that running
    # this module never overwrites or deletes a real rewardkit.yaml in the
    # current working directory, and cleanup happens even if a step fails.
    import tempfile

    dummy_config_content = """
default_deployment_target: gcp-cloud-run

gcp_cloud_run:
  project_id: "test-gcp-project"
  region: "europe-west1"
  default_auth_mode: "iam"
  secrets:
    DB_PASSWORD: "projects/test-gcp-project/secrets/db-password/versions/latest"

aws_lambda:
  region: "eu-north-1"

evaluator_endpoint_keys:
  my_test_eval: "dummy_key_for_test_eval"
"""
    with tempfile.TemporaryDirectory() as scratch_dir:
        dummy_file_path = os.path.join(scratch_dir, CONFIG_FILE_NAME)
        with open(dummy_file_path, "w", encoding="utf-8") as f:
            f.write(dummy_config_content)

        print(f"Created dummy {CONFIG_FILE_NAME} for testing.")

        # Load explicitly from the scratch file rather than relying on the CWD.
        config = load_config(dummy_file_path)
        print("\nLoaded configuration:")
        print(f" Default Target: {config.default_deployment_target}")
        if config.gcp_cloud_run:
            print(f" GCP Project ID: {config.gcp_cloud_run.project_id}")
            print(f" GCP Region: {config.gcp_cloud_run.region}")
            print(f" GCP Auth Mode: {config.gcp_cloud_run.default_auth_mode}")
            print(f" GCP Secrets: {config.gcp_cloud_run.secrets}")
        if config.aws_lambda:
            print(f" AWS Region: {config.aws_lambda.region}")
        print(f" Endpoint Keys: {config.evaluator_endpoint_keys}")

        # Test finding config
        print("\nTesting find_config_file():")
        found_path = find_config_file(scratch_dir)
        print(f" Found at: {found_path}")

    # Leaving the "with" block deletes the scratch directory and the dummy file.
    print(f"\nRemoved dummy {CONFIG_FILE_NAME}.")

    # Test loading with no file
    _loaded_config = None  # Reset cache
    _config_file_path = None
    # NOTE: get_config() searches upwards from the CWD, so a real
    # rewardkit.yaml in the CWD or a parent directory will be picked up here.
    print("\nTesting get_config() with no file present:")
    config_no_file = get_config()
    print(f" Default Target (no file): {config_no_file.default_deployment_target}")
    print(f" GCP Config (no file): {config_no_file.gcp_cloud_run}")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Dataset loading and processing utilities."""
|