eval-protocol 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- development/__init__.py +1 -0
- development/normalize_sandbox_fusion.py +628 -0
- development/utils/__init__.py +1 -0
- development/utils/generate_api_key.py +31 -0
- development/utils/subprocess_manager.py +481 -0
- eval_protocol/__init__.py +86 -0
- eval_protocol/__main__.py +10 -0
- eval_protocol/_version.py +21 -0
- eval_protocol/adapters/__init__.py +1 -0
- eval_protocol/adapters/braintrust.py +8 -0
- eval_protocol/adapters/trl.py +8 -0
- eval_protocol/agent/__init__.py +29 -0
- eval_protocol/agent/models.py +69 -0
- eval_protocol/agent/orchestrator.py +893 -0
- eval_protocol/agent/resource_abc.py +89 -0
- eval_protocol/agent/resource_pool.py +184 -0
- eval_protocol/agent/resources/__init__.py +44 -0
- eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
- eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
- eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
- eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
- eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
- eval_protocol/agent/resources/docker_resource.py +479 -0
- eval_protocol/agent/resources/filesystem_resource.py +371 -0
- eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
- eval_protocol/agent/resources/http_rollout_resource.py +325 -0
- eval_protocol/agent/resources/python_state_resource.py +170 -0
- eval_protocol/agent/resources/sql_resource.py +271 -0
- eval_protocol/agent/task_manager.py +1064 -0
- eval_protocol/agent/tool_registry.py +111 -0
- eval_protocol/auth.py +156 -0
- eval_protocol/cli.py +425 -0
- eval_protocol/cli_commands/__init__.py +1 -0
- eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
- eval_protocol/cli_commands/common.py +242 -0
- eval_protocol/cli_commands/deploy.py +486 -0
- eval_protocol/cli_commands/deploy_mcp.py +287 -0
- eval_protocol/cli_commands/preview.py +186 -0
- eval_protocol/cli_commands/run_eval_cmd.py +202 -0
- eval_protocol/common_utils.py +36 -0
- eval_protocol/config.py +180 -0
- eval_protocol/datasets/__init__.py +1 -0
- eval_protocol/datasets/loader.py +521 -0
- eval_protocol/evaluation.py +1045 -0
- eval_protocol/execution/__init__.py +1 -0
- eval_protocol/execution/pipeline.py +920 -0
- eval_protocol/gcp_tools.py +484 -0
- eval_protocol/generation/cache.py +141 -0
- eval_protocol/generation/clients/base.py +67 -0
- eval_protocol/generation/clients.py +248 -0
- eval_protocol/generic_server.py +165 -0
- eval_protocol/integrations/__init__.py +12 -0
- eval_protocol/integrations/braintrust.py +51 -0
- eval_protocol/integrations/deepeval.py +106 -0
- eval_protocol/integrations/openeval.py +40 -0
- eval_protocol/integrations/trl.py +187 -0
- eval_protocol/mcp/__init__.py +48 -0
- eval_protocol/mcp/adapter.py +131 -0
- eval_protocol/mcp/client/__init__.py +12 -0
- eval_protocol/mcp/client/connection.py +499 -0
- eval_protocol/mcp/clients.py +195 -0
- eval_protocol/mcp/execution/__init__.py +23 -0
- eval_protocol/mcp/execution/base_policy.py +227 -0
- eval_protocol/mcp/execution/fireworks_policy.py +209 -0
- eval_protocol/mcp/execution/manager.py +506 -0
- eval_protocol/mcp/execution/policy.py +421 -0
- eval_protocol/mcp/grid_renderer.py +54 -0
- eval_protocol/mcp/mcpgym.py +637 -0
- eval_protocol/mcp/process_manager.py +177 -0
- eval_protocol/mcp/session/__init__.py +11 -0
- eval_protocol/mcp/session/manager.py +228 -0
- eval_protocol/mcp/simple_process_manager.py +291 -0
- eval_protocol/mcp/simulation_server.py +458 -0
- eval_protocol/mcp/types.py +80 -0
- eval_protocol/mcp_agent/__init__.py +1 -0
- eval_protocol/mcp_agent/config.py +147 -0
- eval_protocol/mcp_agent/intermediary_server.py +542 -0
- eval_protocol/mcp_agent/main.py +210 -0
- eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
- eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
- eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
- eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
- eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
- eval_protocol/mcp_agent/session.py +79 -0
- eval_protocol/mcp_env.py +304 -0
- eval_protocol/models.py +366 -0
- eval_protocol/packaging.py +219 -0
- eval_protocol/platform_api.py +360 -0
- eval_protocol/playback_policy.py +396 -0
- eval_protocol/resources.py +128 -0
- eval_protocol/reward_function.py +410 -0
- eval_protocol/rewards/__init__.py +94 -0
- eval_protocol/rewards/accuracy.py +454 -0
- eval_protocol/rewards/accuracy_length.py +173 -0
- eval_protocol/rewards/apps_coding_reward.py +331 -0
- eval_protocol/rewards/apps_execution_utils.py +149 -0
- eval_protocol/rewards/apps_testing_util.py +559 -0
- eval_protocol/rewards/bfcl_reward.py +313 -0
- eval_protocol/rewards/code_execution.py +1620 -0
- eval_protocol/rewards/code_execution_utils.py +72 -0
- eval_protocol/rewards/cpp_code.py +861 -0
- eval_protocol/rewards/deepcoder_reward.py +161 -0
- eval_protocol/rewards/format.py +129 -0
- eval_protocol/rewards/function_calling.py +541 -0
- eval_protocol/rewards/json_schema.py +422 -0
- eval_protocol/rewards/language_consistency.py +700 -0
- eval_protocol/rewards/lean_prover.py +479 -0
- eval_protocol/rewards/length.py +375 -0
- eval_protocol/rewards/list_comparison_math_reward.py +221 -0
- eval_protocol/rewards/math.py +762 -0
- eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
- eval_protocol/rewards/reasoning_steps.py +249 -0
- eval_protocol/rewards/repetition.py +342 -0
- eval_protocol/rewards/tag_count.py +162 -0
- eval_protocol/rl_processing.py +82 -0
- eval_protocol/server.py +271 -0
- eval_protocol/typed_interface.py +260 -0
- eval_protocol/utils/__init__.py +8 -0
- eval_protocol/utils/batch_evaluation.py +217 -0
- eval_protocol/utils/batch_transformation.py +205 -0
- eval_protocol/utils/dataset_helpers.py +112 -0
- eval_protocol/utils/module_loader.py +56 -0
- eval_protocol/utils/packaging_utils.py +108 -0
- eval_protocol/utils/static_policy.py +305 -0
- eval_protocol-0.0.3.dist-info/METADATA +635 -0
- eval_protocol-0.0.3.dist-info/RECORD +130 -0
- eval_protocol-0.0.3.dist-info/WHEEL +5 -0
- eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
- eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
- eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: eval-protocol
|
|
3
|
+
Version: 0.0.3
|
|
4
|
+
Summary: A Python library for defining, testing, and using reward functions
|
|
5
|
+
Author-email: Fireworks AI <info@fireworks.ai>
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: requests>=2.25.0
|
|
14
|
+
Requires-Dist: pydantic>=2.0.0
|
|
15
|
+
Requires-Dist: dataclasses-json>=0.5.7
|
|
16
|
+
Requires-Dist: fastapi>=0.68.0
|
|
17
|
+
Requires-Dist: uvicorn>=0.15.0
|
|
18
|
+
Requires-Dist: python-dotenv>=0.19.0
|
|
19
|
+
Requires-Dist: openai==1.78.1
|
|
20
|
+
Requires-Dist: aiosqlite
|
|
21
|
+
Requires-Dist: aiohttp
|
|
22
|
+
Requires-Dist: mcp>=1.9.2
|
|
23
|
+
Requires-Dist: PyYAML>=5.0
|
|
24
|
+
Requires-Dist: datasets==3.6.0
|
|
25
|
+
Requires-Dist: fsspec==2025.3.0
|
|
26
|
+
Requires-Dist: hydra-core>=1.3.2
|
|
27
|
+
Requires-Dist: omegaconf>=2.3.0
|
|
28
|
+
Requires-Dist: gymnasium>=0.29.0
|
|
29
|
+
Requires-Dist: httpx>=0.24.0
|
|
30
|
+
Requires-Dist: anthropic>=0.59.0
|
|
31
|
+
Requires-Dist: ipykernel>=6.30.0
|
|
32
|
+
Requires-Dist: jupyter>=1.1.1
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: build; extra == "dev"
|
|
35
|
+
Requires-Dist: twine; extra == "dev"
|
|
36
|
+
Requires-Dist: pytest>=6.0.0; extra == "dev"
|
|
37
|
+
Requires-Dist: pytest-asyncio; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-httpserver; extra == "dev"
|
|
39
|
+
Requires-Dist: werkzeug>=2.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: black>=21.5b2; extra == "dev"
|
|
41
|
+
Requires-Dist: mypy>=0.812; extra == "dev"
|
|
42
|
+
Requires-Dist: flake8>=3.9.2; extra == "dev"
|
|
43
|
+
Requires-Dist: autopep8>=1.5.0; extra == "dev"
|
|
44
|
+
Requires-Dist: transformers>=4.0.0; extra == "dev"
|
|
45
|
+
Requires-Dist: types-setuptools; extra == "dev"
|
|
46
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
47
|
+
Requires-Dist: types-PyYAML; extra == "dev"
|
|
48
|
+
Requires-Dist: types-docker; extra == "dev"
|
|
49
|
+
Requires-Dist: versioneer>=0.20; extra == "dev"
|
|
50
|
+
Requires-Dist: openai==1.78.1; extra == "dev"
|
|
51
|
+
Requires-Dist: pre-commit; extra == "dev"
|
|
52
|
+
Requires-Dist: e2b; extra == "dev"
|
|
53
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
54
|
+
Requires-Dist: pytest-xdist; extra == "dev"
|
|
55
|
+
Requires-Dist: docker==7.1.0; extra == "dev"
|
|
56
|
+
Requires-Dist: ipykernel>=6.30.0; extra == "dev"
|
|
57
|
+
Requires-Dist: jupyter>=1.1.1; extra == "dev"
|
|
58
|
+
Requires-Dist: pip>=25.1.1; extra == "dev"
|
|
59
|
+
Provides-Extra: trl
|
|
60
|
+
Requires-Dist: torch>=1.9; extra == "trl"
|
|
61
|
+
Requires-Dist: trl>=0.7.0; extra == "trl"
|
|
62
|
+
Requires-Dist: peft>=0.7.0; extra == "trl"
|
|
63
|
+
Requires-Dist: transformers>=4.0.0; extra == "trl"
|
|
64
|
+
Requires-Dist: accelerate>=0.28.0; extra == "trl"
|
|
65
|
+
Provides-Extra: openevals
|
|
66
|
+
Requires-Dist: openevals>=0.1.0; extra == "openevals"
|
|
67
|
+
Provides-Extra: fireworks
|
|
68
|
+
Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
|
|
69
|
+
Dynamic: license-file
|
|
70
|
+
|
|
71
|
+
# Reward Protocol
|
|
72
|
+
|
|
73
|
+
**Reward Protocol: Author, reproduce, and evaluate reward functions seamlessly on Fireworks, TRL, and your own infrastructure.**
|
|
74
|
+
|
|
75
|
+
## Key Features
|
|
76
|
+
|
|
77
|
+
* **Easy-to-use Decorator**: Define reward functions with a simple `@reward_function` decorator.
|
|
78
|
+
* **Local Testing**: Quickly test your reward functions with sample data.
|
|
79
|
+
* **Flexible Evaluation**: Evaluate model outputs based on single or multiple custom metrics.
|
|
80
|
+
* **Seamless Deployment**: Deploy your reward functions to platforms like Fireworks AI.
|
|
81
|
+
* **Comprehensive CLI**: Manage reward functions, preview evaluations (`eval-protocol preview`), deploy (`eval-protocol deploy`), and run complex evaluation pipelines (`eval-protocol run`).
|
|
82
|
+
* **Simplified Dataset Integration**: Direct integration with HuggingFace datasets and on-the-fly format conversion.
|
|
83
|
+
* **Extensible**: Designed to be adaptable for various LLM evaluation scenarios.
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install eval-protocol
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Optional TRL Extras
|
|
92
|
+
|
|
93
|
+
Install the additional dependencies required for running the TRL-based training
|
|
94
|
+
examples:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install "eval-protocol[trl]"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Getting Started
|
|
101
|
+
|
|
102
|
+
Reward Protocol simplifies the creation and deployment of reward functions for evaluating AI model outputs.
|
|
103
|
+
|
|
104
|
+
### 1. Creating a Reward Function for Tool Calling
|
|
105
|
+
|
|
106
|
+
Reward Protocol allows you to define custom logic to evaluate model responses. Here's an example of how you might use the built-in `exact_tool_match_reward` for evaluating tool/function calls. This reward function checks if the model's generated tool calls exactly match the expected ones.
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# This is a conceptual example of how exact_tool_match_reward is defined and used.
|
|
110
|
+
# You would typically import it from eval_protocol.rewards.function_calling.
|
|
111
|
+
# For actual usage, you configure it in your YAML files for `eval-protocol run`.
|
|
112
|
+
|
|
113
|
+
from eval_protocol import reward_function
|
|
114
|
+
from eval_protocol.models import EvaluateResult, Message, MetricResult
|
|
115
|
+
from typing import List, Dict, Any, Optional, Union
|
|
116
|
+
|
|
117
|
+
# Definition of exact_tool_match_reward (simplified for brevity, see source for full details)
|
|
118
|
+
# from eval_protocol.rewards.function_calling import exact_tool_match_reward, eval_tool_call
|
|
119
|
+
|
|
120
|
+
@reward_function
|
|
121
|
+
def exact_tool_match_reward(
|
|
122
|
+
messages: Union[List[Message], List[Dict[str, Any]]],
|
|
123
|
+
ground_truth: Optional[Dict[str, Any]] = None,
|
|
124
|
+
**kwargs,
|
|
125
|
+
) -> EvaluateResult:
|
|
126
|
+
if not messages:
|
|
127
|
+
return EvaluateResult(
|
|
128
|
+
score=0.0, reason="No messages provided for evaluation.", metrics={}
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
generation_message_obj = messages[-1]
|
|
132
|
+
generation_dict: Dict[str, Any]
|
|
133
|
+
|
|
134
|
+
if isinstance(generation_message_obj, Message):
|
|
135
|
+
generation_dict = {
|
|
136
|
+
"role": generation_message_obj.role,
|
|
137
|
+
"content": generation_message_obj.content,
|
|
138
|
+
}
|
|
139
|
+
if generation_message_obj.tool_calls:
|
|
140
|
+
generation_dict["tool_calls"] = [
|
|
141
|
+
tc.model_dump() if hasattr(tc, "model_dump") else tc
|
|
142
|
+
for tc in generation_message_obj.tool_calls
|
|
143
|
+
]
|
|
144
|
+
elif isinstance(generation_message_obj, dict):
|
|
145
|
+
generation_dict = generation_message_obj
|
|
146
|
+
else:
|
|
147
|
+
# Handle error for unexpected type
|
|
148
|
+
return EvaluateResult(score=0.0, reason="Unexpected generation message type.", metrics={})
|
|
149
|
+
|
|
150
|
+
if ground_truth is None:
|
|
151
|
+
# Handle missing ground truth (e.g., score 0 if generation has tool calls, 1 if not)
|
|
152
|
+
# This logic is simplified here.
|
|
153
|
+
has_gen_tc = bool(generation_dict.get("tool_calls") or "<tool_call>" in generation_dict.get("content", ""))
|
|
154
|
+
score = 0.0 if has_gen_tc else 1.0
|
|
155
|
+
return EvaluateResult(score=score, reason="Ground truth not provided.", metrics={})
|
|
156
|
+
|
|
157
|
+
# Ensure ground_truth is a dict (it might be a JSON string from some datasets)
|
|
158
|
+
if isinstance(ground_truth, str):
|
|
159
|
+
try:
|
|
160
|
+
ground_truth = json.loads(ground_truth)
|
|
161
|
+
except json.JSONDecodeError:
|
|
162
|
+
return EvaluateResult(score=0.0, reason="Ground truth string failed to parse.", metrics={})
|
|
163
|
+
|
|
164
|
+
if not isinstance(ground_truth, dict):
|
|
165
|
+
return EvaluateResult(score=0.0, reason="Ground truth is not a dictionary.", metrics={})
|
|
166
|
+
|
|
167
|
+
# This simplified check compares generated tool calls with the expected ones.
|
|
168
|
+
expected_tcs = ground_truth.get("tool_calls", [])
|
|
169
|
+
generated_tcs = generation_dict.get("tool_calls", [])
|
|
170
|
+
|
|
171
|
+
# This is a highly simplified check. The actual function is much more robust.
|
|
172
|
+
is_match = (len(expected_tcs) == len(generated_tcs)) # Placeholder
|
|
173
|
+
score = 1.0 if is_match else 0.0
|
|
174
|
+
|
|
175
|
+
reason = f"Exact tool match evaluation score: {score}"
|
|
176
|
+
return EvaluateResult(score=score, reason=reason, metrics={
|
|
177
|
+
"tool_call_match": MetricResult(score=score, success=is_match, reason=reason)
|
|
178
|
+
})
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
This example illustrates the structure. The actual `exact_tool_match_reward` in `eval_protocol.rewards.function_calling` handles complex parsing and comparison of tool calls.
|
|
182
|
+
|
|
183
|
+
### 2. Testing Your Reward Function with a Dataset
|
|
184
|
+
|
|
185
|
+
Effective testing of a reward function involves evaluating it against a representative dataset. The key is the **dataset/reward function pair**: your dataset should provide the necessary `ground_truth` information that your reward function expects.
|
|
186
|
+
|
|
187
|
+
**Crafting Your Dataset:**
|
|
188
|
+
|
|
189
|
+
1. **Define `ground_truth`**: For each sample in your dataset, the `ground_truth_for_eval` (or a similarly named field specified in your dataset configuration) must contain the information your reward function needs to make a judgment.
|
|
190
|
+
* For `exact_tool_match_reward`, `ground_truth` should be a dictionary, often with a `tool_calls` key. This key would hold a list of expected tool calls, each specifying the `name` and `arguments` of the function call. Example:
|
|
191
|
+
```json
|
|
192
|
+
{
|
|
193
|
+
"role": "assistant",
|
|
194
|
+
"tool_calls": [
|
|
195
|
+
{
|
|
196
|
+
"name": "get_weather",
|
|
197
|
+
"arguments": {"location": "San Francisco, CA", "unit": "celsius"}
|
|
198
|
+
}
|
|
199
|
+
]
|
|
200
|
+
}
|
|
201
|
+
```
|
|
202
|
+
2. **Format**: Datasets are typically JSONL files, where each line is a JSON object representing a sample. Each sample should include:
|
|
203
|
+
* `messages`: The input conversation history for the model.
|
|
204
|
+
* `tools` (optional, for tool calling): A list of available tools the model can use.
|
|
205
|
+
* `ground_truth_for_eval`: The expected output or data for the reward function (e.g., the structure shown above for tool calling).
|
|
206
|
+
* An `id` for tracking.
|
|
207
|
+
|
|
208
|
+
**Example Test Snippet (Conceptual):**
|
|
209
|
+
|
|
210
|
+
While `eval-protocol run` is the primary way to evaluate with datasets, here's a conceptual local test:
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from eval_protocol.rewards.function_calling import exact_tool_match_reward # Import the actual function
|
|
214
|
+
from eval_protocol.models import Message
|
|
215
|
+
|
|
216
|
+
# Sample 1: Correct tool call
|
|
217
|
+
test_messages_correct = [
|
|
218
|
+
Message(role="user", content="What's the weather in SF?"),
|
|
219
|
+
Message(role="assistant", tool_calls=[ # Model's generated tool call
|
|
220
|
+
{"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": '{"location": "San Francisco, CA", "unit": "celsius"}'}}
|
|
221
|
+
])
|
|
222
|
+
]
|
|
223
|
+
ground_truth_correct = { # Expected tool call for the reward function
|
|
224
|
+
"tool_calls": [
|
|
225
|
+
{"name": "get_weather", "arguments": {"location": "San Francisco, CA", "unit": "celsius"}}
|
|
226
|
+
]
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
# Sample 2: Incorrect tool call
|
|
230
|
+
test_messages_incorrect = [
|
|
231
|
+
Message(role="user", content="What's the weather in SF?"),
|
|
232
|
+
Message(role="assistant", tool_calls=[
|
|
233
|
+
{"id": "call_456", "type": "function", "function": {"name": "get_current_time", "arguments": '{}'}}
|
|
234
|
+
])
|
|
235
|
+
]
|
|
236
|
+
# Ground truth remains the same as we expect get_weather
|
|
237
|
+
|
|
238
|
+
# Test with the actual reward function
|
|
239
|
+
result_correct = exact_tool_match_reward(messages=test_messages_correct, ground_truth=ground_truth_correct)
|
|
240
|
+
print(f"Correct Call - Score: {result_correct.score}, Reason: {result_correct.reason}")
|
|
241
|
+
|
|
242
|
+
result_incorrect = exact_tool_match_reward(messages=test_messages_incorrect, ground_truth=ground_truth_correct)
|
|
243
|
+
print(f"Incorrect Call - Score: {result_incorrect.score}, Reason: {result_incorrect.reason}")
|
|
244
|
+
```
|
|
245
|
+
This local test helps verify the reward function's logic with specific inputs. For comprehensive evaluation, use `eval-protocol run` with a full dataset (see next section).
|
|
246
|
+
|
|
247
|
+
### 3. Running Local Evaluations with `eval-protocol run`
|
|
248
|
+
|
|
249
|
+
For comprehensive local evaluations, especially when working with datasets and complex configurations, the `eval-protocol run` command is the recommended tool. It leverages Hydra for configuration management, allowing you to define your evaluation pipeline (dataset, model, reward function, etc.) in YAML files.
|
|
250
|
+
|
|
251
|
+
**Example: Math Evaluation using the GSM8K dataset (`gsm8k`)**
|
|
252
|
+
|
|
253
|
+
The `examples/math_example` directory demonstrates how to evaluate models on math word problems.
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
# Ensure you are in the repository root
|
|
257
|
+
# cd /path/to/eval-protocol
|
|
258
|
+
|
|
259
|
+
# Run evaluation with the math configuration
|
|
260
|
+
eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
|
|
261
|
+
|
|
262
|
+
# Override parameters directly from the command line:
|
|
263
|
+
eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf \
|
|
264
|
+
generation.model_name="accounts/fireworks/models/llama-v3p1-405b-instruct" \
|
|
265
|
+
evaluation_params.limit_samples=10
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
**What this command does (typically):**
|
|
269
|
+
* Loads the specified dataset (e.g., GSM8K directly from HuggingFace).
|
|
270
|
+
* Applies any dataset-specific prompts or preprocessing defined in the configuration.
|
|
271
|
+
* Generates model responses (e.g., using the Fireworks API or other configured providers).
|
|
272
|
+
* Evaluates the generated responses using the specified reward function(s).
|
|
273
|
+
* Saves detailed evaluation results to `<config_output_name>.jsonl` (e.g., `math_example_results.jsonl`) in a timestamped output directory (e.g., under `outputs/`).
|
|
274
|
+
* Saves generated prompt/response pairs to `preview_input_output_pairs.jsonl` in the same output directory, suitable for inspection or re-evaluation with `eval-protocol preview`.
|
|
275
|
+
|
|
276
|
+
**Example: APPS Coding Evaluation**
|
|
277
|
+
|
|
278
|
+
The `examples/apps_coding_example` directory shows how to evaluate code generation tasks using the `codeparrot/apps` dataset.
|
|
279
|
+
|
|
280
|
+
```bash
|
|
281
|
+
# Run evaluation with the APPS coding configuration
|
|
282
|
+
eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval
|
|
283
|
+
|
|
284
|
+
# Example: Limit samples for a quick test
|
|
285
|
+
eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval evaluation_params.limit_samples=2
|
|
286
|
+
|
|
287
|
+
# Example: Disable generation to test reward function on cached responses
|
|
288
|
+
eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval generation.enabled=false
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
These examples showcase how `eval-protocol run` can be adapted for different tasks and datasets through configuration files.
|
|
292
|
+
|
|
293
|
+
For more details on this command, Hydra configuration, and advanced usage, see the [CLI Overview](docs/cli_reference/cli_overview.mdx) and [Hydra Configuration Guide](docs/developer_guide/hydra_configuration.mdx).
|
|
294
|
+
|
|
295
|
+
### Fireworks Authentication Setup (Required for Preview/Deploy with Fireworks)
|
|
296
|
+
|
|
297
|
+
To interact with the Fireworks AI platform for deploying and managing evaluations (including preview scenarios that use remote evaluators, and `eval-protocol run` configurations that use a Fireworks-hosted model), Reward Protocol needs your Fireworks AI credentials. You can configure these in two ways:
|
|
298
|
+
|
|
299
|
+
**A. Environment Variables (Highest Priority)**
|
|
300
|
+
|
|
301
|
+
Set the following environment variables:
|
|
302
|
+
|
|
303
|
+
* `FIREWORKS_API_KEY`: Your Fireworks AI API key. This is required for all interactions with the Fireworks API.
|
|
304
|
+
* `FIREWORKS_ACCOUNT_ID`: Your Fireworks AI Account ID. This is often required for operations like creating or listing evaluators under your account.
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
export FIREWORKS_API_KEY="your_fireworks_api_key"
|
|
308
|
+
export FIREWORKS_ACCOUNT_ID="your_fireworks_account_id"
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**B. Configuration File (Lower Priority)**
|
|
312
|
+
|
|
313
|
+
Alternatively, you can store your credentials in a configuration file located at `~/.fireworks/auth.ini`. If environment variables are not set, Reward Protocol will look for this file.
|
|
314
|
+
|
|
315
|
+
Create the file with the following format:
|
|
316
|
+
|
|
317
|
+
```ini
|
|
318
|
+
[fireworks]
|
|
319
|
+
api_key = YOUR_FIREWORKS_API_KEY
|
|
320
|
+
account_id = YOUR_FIREWORKS_ACCOUNT_ID
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
Replace `YOUR_FIREWORKS_API_KEY` and `YOUR_FIREWORKS_ACCOUNT_ID` with your actual credentials.
|
|
324
|
+
|
|
325
|
+
**Credential Sourcing Order:**
|
|
326
|
+
|
|
327
|
+
Reward Protocol will prioritize credentials in the following order:
|
|
328
|
+
1. Environment Variables (`FIREWORKS_API_KEY`, `FIREWORKS_ACCOUNT_ID`)
|
|
329
|
+
2. `~/.fireworks/auth.ini` configuration file
|
|
330
|
+
|
|
331
|
+
Ensure that the `auth.ini` file has appropriate permissions to protect your sensitive credentials (for example, `chmod 600 ~/.fireworks/auth.ini` so that only your user can read it).
|
|
332
|
+
|
|
333
|
+
The `FIREWORKS_API_KEY` is essential for authenticating your requests to the Fireworks AI service. The `FIREWORKS_ACCOUNT_ID` is used to identify your specific account context for operations that are account-specific, such as managing your evaluators. While the API key authenticates *who* you are, the account ID often specifies *where* (under which account) an operation should take place. Some Fireworks API endpoints may require both.
|
|
334
|
+
|
|
335
|
+
### 4. Evaluating with Sample Data (Preview)
|
|
336
|
+
|
|
337
|
+
Create a JSONL file with sample conversations to evaluate:
|
|
338
|
+
|
|
339
|
+
```json
|
|
340
|
+
{"messages": [{"role": "user", "content": "Tell me about AI"}, {"role": "assistant", "content": "AI refers to systems designed to mimic human intelligence."}]}
|
|
341
|
+
{"messages": [{"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "Machine learning is a subset of AI that focuses on building systems that can learn from data."}]}
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
Preview your evaluation using the CLI:
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
eval-protocol preview --metrics-folders "word_count=./path/to/metrics" --samples ./path/to/samples.jsonl
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
For example:
|
|
351
|
+
```bash
|
|
352
|
+
eval-protocol preview --metrics-folders "word_count=examples/metrics/word_count" --samples development/CODING_DATASET.jsonl
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### 5. Deploying Your Reward Function
|
|
356
|
+
|
|
357
|
+
Deploy your reward function to use in training workflows:
|
|
358
|
+
|
|
359
|
+
```bash
|
|
360
|
+
eval-protocol deploy --id my-evaluator --metrics-folders "word_count=./path/to/metrics" --force
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
#### Local Development Server
|
|
364
|
+
|
|
365
|
+
For local development and testing, you can deploy a reward function as a local server with external tunnel access:
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
# Deploy as local server with automatic tunnel (ngrok/serveo)
|
|
369
|
+
eval-protocol deploy --id test-local-serve-eval --target local-serve --function-ref examples.row_wise.dummy_example.dummy_rewards.simple_echo_reward --verbose --force
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
**What this does:**
|
|
373
|
+
- Starts a local HTTP server on port 8001 serving your reward function
|
|
374
|
+
- Creates an external tunnel (using ngrok or serveo.net) to make the server publicly accessible
|
|
375
|
+
- Registers the tunnel URL with Fireworks AI for remote evaluation
|
|
376
|
+
- Keeps the server running indefinitely in the background
|
|
377
|
+
|
|
378
|
+
**Key points:**
|
|
379
|
+
- The CLI returns to the prompt after deployment, but the server continues running in the background
|
|
380
|
+
- Check running processes: `ps aux | grep -E "(generic_server|ngrok)"`
|
|
381
|
+
- Test locally: `curl -X POST http://localhost:8001/evaluate -H "Content-Type: application/json" -d '{"messages": [{"role": "user", "content": "test"}]}'`
|
|
382
|
+
- Monitor logs: `tail -f logs/eval-protocol-local/generic_server_*.log`
|
|
383
|
+
- Stop server: Kill the background processes manually when done
|
|
384
|
+
|
|
385
|
+
This is ideal for development, testing webhook integrations, or accessing your reward function from remote services without full cloud deployment.
|
|
386
|
+
|
|
387
|
+
Or deploy programmatically:
|
|
388
|
+
|
|
389
|
+
```python
|
|
390
|
+
from eval_protocol.evaluation import create_evaluation
|
|
391
|
+
|
|
392
|
+
evaluator = create_evaluation(
|
|
393
|
+
evaluator_id="my-evaluator",
|
|
394
|
+
metric_folders=["word_count=./path/to/metrics"],
|
|
395
|
+
display_name="My Word Count Evaluator",
|
|
396
|
+
description="Evaluates responses based on word count",
|
|
397
|
+
force=True # Update if already exists
|
|
398
|
+
)
|
|
399
|
+
```
|
|
400
|
+
|
|
401
|
+
## Advanced Usage
|
|
402
|
+
|
|
403
|
+
### Multiple Metrics
|
|
404
|
+
|
|
405
|
+
Combine multiple metrics in a single reward function:
|
|
406
|
+
|
|
407
|
+
```python
|
|
408
|
+
from eval_protocol import reward_function
|
|
409
|
+
from eval_protocol.models import EvaluateResult, MetricResult, Message # Assuming models are here
|
|
410
|
+
from typing import List, Dict, Any, Optional
|
|
411
|
+
|
|
412
|
+
@reward_function
|
|
413
|
+
def combined_reward(
|
|
414
|
+
messages: List[Dict[str, Any]], # Or List[Message]
|
|
415
|
+
original_messages: Optional[List[Dict[str, Any]]] = None, # Or List[Message]
|
|
416
|
+
**kwargs: Any
|
|
417
|
+
) -> EvaluateResult:
|
|
418
|
+
"""Evaluate with multiple metrics."""
|
|
419
|
+
response = messages[-1].get("content", "")
|
|
420
|
+
|
|
421
|
+
# Word count metric
|
|
422
|
+
word_count = len(response.split())
|
|
423
|
+
word_score = min(word_count / 100.0, 1.0)
|
|
424
|
+
word_metric_success = word_count > 10
|
|
425
|
+
|
|
426
|
+
# Specificity metric
|
|
427
|
+
specificity_markers = ["specifically", "for example", "such as"]
|
|
428
|
+
marker_count = sum(1 for marker in specificity_markers if marker.lower() in response.lower())
|
|
429
|
+
specificity_score = min(marker_count / 2.0, 1.0)
|
|
430
|
+
specificity_metric_success = marker_count > 0
|
|
431
|
+
|
|
432
|
+
# Combined score with weighted components
|
|
433
|
+
final_score = word_score * 0.3 + specificity_score * 0.7
|
|
434
|
+
|
|
435
|
+
return EvaluateResult(
|
|
436
|
+
score=final_score,
|
|
437
|
+
reason=f"Combined score based on word count ({word_count}) and specificity markers ({marker_count})",
|
|
438
|
+
metrics={
|
|
439
|
+
"word_count": MetricResult(
|
|
440
|
+
score=word_score,
|
|
441
|
+
success=word_metric_success,
|
|
442
|
+
reason=f"Word count: {word_count}"
|
|
443
|
+
),
|
|
444
|
+
"specificity": MetricResult(
|
|
445
|
+
score=specificity_score,
|
|
446
|
+
success=specificity_metric_success,
|
|
447
|
+
reason=f"Found {marker_count} specificity markers"
|
|
448
|
+
)
|
|
449
|
+
}
|
|
450
|
+
)
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
### Custom Model Providers
|
|
454
|
+
|
|
455
|
+
Deploy your reward function with a specific model provider:
|
|
456
|
+
|
|
457
|
+
```python
|
|
458
|
+
# Deploy with a custom provider
|
|
459
|
+
my_function.deploy(
|
|
460
|
+
name="my-evaluator-anthropic",
|
|
461
|
+
description="My evaluator using Claude model",
|
|
462
|
+
providers=[
|
|
463
|
+
{
|
|
464
|
+
"providerType": "anthropic",
|
|
465
|
+
"modelId": "claude-3-sonnet-20240229"
|
|
466
|
+
}
|
|
467
|
+
],
|
|
468
|
+
force=True
|
|
469
|
+
)
|
|
470
|
+
```
|
|
471
|
+
|
|
472
|
+
## Dataset Integration
|
|
473
|
+
|
|
474
|
+
Reward Protocol provides seamless integration with popular datasets through a simplified configuration system:
|
|
475
|
+
|
|
476
|
+
### Direct HuggingFace Integration
|
|
477
|
+
|
|
478
|
+
Load datasets directly from HuggingFace Hub without manual preprocessing:
|
|
479
|
+
|
|
480
|
+
```bash
|
|
481
|
+
# Evaluate using GSM8K dataset with math-specific prompts
|
|
482
|
+
eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### Derived Datasets
|
|
486
|
+
|
|
487
|
+
Create specialized dataset configurations that reference base datasets and apply transformations:
|
|
488
|
+
|
|
489
|
+
```yaml
|
|
490
|
+
# conf/dataset/gsm8k_math_prompts.yaml
|
|
491
|
+
defaults:
|
|
492
|
+
- base_derived_dataset
|
|
493
|
+
- _self_
|
|
494
|
+
|
|
495
|
+
base_dataset: "gsm8k"
|
|
496
|
+
system_prompt: "Solve the following math problem. Show your work clearly. Put the final numerical answer between <answer> and </answer> tags."
|
|
497
|
+
output_format: "evaluation_format"
|
|
498
|
+
derived_max_samples: 5
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### Key Benefits
|
|
502
|
+
|
|
503
|
+
- **No Manual Conversion**: Datasets are converted to evaluation format on the fly
|
|
504
|
+
- **System Prompt Integration**: Prompts are part of dataset configuration, not evaluation logic
|
|
505
|
+
- **Flexible Column Mapping**: Automatically adapts to different dataset formats
|
|
506
|
+
- **Reusable Configurations**: Base datasets can be extended for different use cases
|
|
507
|
+
|
|
508
|
+
See the [math example](examples/math_example/) for a complete demonstration of the dataset system.
|
|
509
|
+
|
|
510
|
+
## Detailed Documentation
|
|
511
|
+
|
|
512
|
+
For more comprehensive information, including API references, tutorials, and advanced guides, please see our [full documentation](docs/documentation_home.mdx).
|
|
513
|
+
|
|
514
|
+
## Examples
|
|
515
|
+
|
|
516
|
+
Check the `examples` directory for complete examples:
|
|
517
|
+
|
|
518
|
+
- `evaluation_preview_example.py`: How to preview an evaluator.
|
|
519
|
+
- `deploy_example.py`: How to deploy a reward function to Fireworks.
|
|
520
|
+
- `math_example/`: Demonstrates CLI-based evaluation (`eval-protocol run`) and TRL GRPO training for math problems (GSM8K dataset).
|
|
521
|
+
- `apps_coding_example/`: Shows CLI-based evaluation (`eval-protocol run`) for code generation tasks (APPS dataset).
|
|
523
|
+
|
|
524
|
+
The OpenEvals project provides a suite of evaluators that can be used directly within Reward Protocol. The helper `eval_protocol.integrations.openeval.adapt` converts any OpenEvals evaluator into a reward function returning an `EvaluateResult`.
|
|
525
|
+
|
|
526
|
+
```python
|
|
527
|
+
from openevals import exact_match
|
|
528
|
+
from eval_protocol.integrations.openeval import adapt
|
|
529
|
+
|
|
530
|
+
exact_match_reward = adapt(exact_match)
|
|
531
|
+
result = exact_match_reward(
|
|
532
|
+
messages=[{"role": "assistant", "content": "hello"}],
|
|
533
|
+
ground_truth="hello",
|
|
534
|
+
)
|
|
535
|
+
print(result.score)
|
|
536
|
+
```
|
|
537
|
+
|
|
538
|
+
The [deepeval](https://github.com/confident-ai/deepeval) project also offers a
|
|
539
|
+
variety of metrics. The helper `eval_protocol.integrations.deepeval.adapt_metric`
|
|
540
|
+
converts a deepeval metric instance into a reward function returning an
|
|
541
|
+
`EvaluateResult`.
|
|
542
|
+
|
|
543
|
+
```python
|
|
544
|
+
from deepeval.metrics import FaithfulnessMetric
|
|
545
|
+
from eval_protocol.integrations.deepeval import adapt_metric
|
|
546
|
+
|
|
547
|
+
faithfulness_reward = adapt_metric(FaithfulnessMetric())
|
|
548
|
+
result = faithfulness_reward(
|
|
549
|
+
messages=[{"role": "assistant", "content": "hello"}],
|
|
550
|
+
ground_truth="hello",
|
|
551
|
+
)
|
|
552
|
+
print(result.score)
|
|
553
|
+
```
|
|
554
|
+
|
|
555
|
+
The GEval metric family uses an LLM as a judge to score outputs based on
|
|
556
|
+
custom criteria. You can construct a `GEval` metric and adapt it in the same
|
|
557
|
+
way:
|
|
558
|
+
|
|
559
|
+
```python
|
|
560
|
+
from deepeval.metrics import GEval
|
|
561
|
+
from deepeval.test_case import LLMTestCaseParams
|
|
562
|
+
from eval_protocol.integrations.deepeval import adapt_metric
|
|
563
|
+
|
|
564
|
+
correctness_metric = GEval(
|
|
565
|
+
name="Correctness",
|
|
566
|
+
criteria="Determine whether the answer is factually correct",
|
|
567
|
+
evaluation_params=[
|
|
568
|
+
LLMTestCaseParams.INPUT,
|
|
569
|
+
LLMTestCaseParams.ACTUAL_OUTPUT,
|
|
570
|
+
LLMTestCaseParams.EXPECTED_OUTPUT,
|
|
571
|
+
],
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
correctness_reward = adapt_metric(correctness_metric)
|
|
575
|
+
result = correctness_reward(
|
|
576
|
+
messages=[{"role": "user", "content": "Who wrote 1984?"}, {"role": "assistant", "content": "George Orwell"}],
|
|
577
|
+
ground_truth="George Orwell",
|
|
578
|
+
)
|
|
579
|
+
print(result.score)
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
## Command Line Interface
|
|
583
|
+
|
|
584
|
+
Eval Protocol includes a CLI for common operations:
|
|
585
|
+
|
|
586
|
+
```bash
|
|
587
|
+
# Show help
|
|
588
|
+
eval-protocol --help
|
|
589
|
+
|
|
590
|
+
# Preview an evaluator
|
|
591
|
+
eval-protocol preview --metrics-folders "metric=./path" --samples ./samples.jsonl
|
|
592
|
+
|
|
593
|
+
# Deploy an evaluator
|
|
594
|
+
eval-protocol deploy --id my-evaluator --metrics-folders "metric=./path" --force
|
|
595
|
+
```
|
|
596
|
+
|
|
597
|
+
## Community and Support
|
|
598
|
+
|
|
599
|
+
* **GitHub Issues**: For bug reports and feature requests, please use [GitHub Issues](https://github.com/eval-protocol/python-sdk/issues).
|
|
600
|
+
* **GitHub Discussions**: For general questions, ideas, and discussions, use GitHub Discussions (if enabled on the repository).
|
|
601
|
+
* Please also review our [Contributing Guidelines](development/CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md).
|
|
602
|
+
|
|
603
|
+
## Development
|
|
604
|
+
|
|
605
|
+
### Type Checking
|
|
606
|
+
|
|
607
|
+
The codebase uses mypy for static type checking. To run type checking:
|
|
608
|
+
|
|
609
|
+
```bash
|
|
610
|
+
# Install development dependencies
|
|
611
|
+
pip install -e ".[dev]"
|
|
612
|
+
|
|
613
|
+
# Run mypy
|
|
614
|
+
mypy eval_protocol
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
Our CI pipeline enforces type checking, so please ensure your code passes mypy checks before submitting PRs.
|
|
618
|
+
|
|
619
|
+
### Running Tests
|
|
620
|
+
|
|
621
|
+
```bash
|
|
622
|
+
# Install test dependencies
|
|
623
|
+
pip install -e ".[dev]"
|
|
624
|
+
|
|
625
|
+
# Run tests
|
|
626
|
+
pytest
|
|
627
|
+
```
|
|
628
|
+
|
|
629
|
+
## Code of Conduct
|
|
630
|
+
|
|
631
|
+
We are dedicated to providing a welcoming and inclusive experience for everyone. Please review and adhere to our [Code of Conduct](CODE_OF_CONDUCT.md).
|
|
632
|
+
|
|
633
|
+
## License
|
|
634
|
+
|
|
635
|
+
Eval Protocol is released under the Apache License 2.0.
|