eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,635 @@
1
+ Metadata-Version: 2.4
2
+ Name: eval-protocol
3
+ Version: 0.0.3
4
+ Summary: A Python library for defining, testing, and using reward functions
5
+ Author-email: Fireworks AI <info@fireworks.ai>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/fireworks-ai/eval-protocol
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: requests>=2.25.0
14
+ Requires-Dist: pydantic>=2.0.0
15
+ Requires-Dist: dataclasses-json>=0.5.7
16
+ Requires-Dist: fastapi>=0.68.0
17
+ Requires-Dist: uvicorn>=0.15.0
18
+ Requires-Dist: python-dotenv>=0.19.0
19
+ Requires-Dist: openai==1.78.1
20
+ Requires-Dist: aiosqlite
21
+ Requires-Dist: aiohttp
22
+ Requires-Dist: mcp>=1.9.2
23
+ Requires-Dist: PyYAML>=5.0
24
+ Requires-Dist: datasets==3.6.0
25
+ Requires-Dist: fsspec==2025.3.0
26
+ Requires-Dist: hydra-core>=1.3.2
27
+ Requires-Dist: omegaconf>=2.3.0
28
+ Requires-Dist: gymnasium>=0.29.0
29
+ Requires-Dist: httpx>=0.24.0
30
+ Requires-Dist: anthropic>=0.59.0
31
+ Requires-Dist: ipykernel>=6.30.0
32
+ Requires-Dist: jupyter>=1.1.1
33
+ Provides-Extra: dev
34
+ Requires-Dist: build; extra == "dev"
35
+ Requires-Dist: twine; extra == "dev"
36
+ Requires-Dist: pytest>=6.0.0; extra == "dev"
37
+ Requires-Dist: pytest-asyncio; extra == "dev"
38
+ Requires-Dist: pytest-httpserver; extra == "dev"
39
+ Requires-Dist: werkzeug>=2.0.0; extra == "dev"
40
+ Requires-Dist: black>=21.5b2; extra == "dev"
41
+ Requires-Dist: mypy>=0.812; extra == "dev"
42
+ Requires-Dist: flake8>=3.9.2; extra == "dev"
43
+ Requires-Dist: autopep8>=1.5.0; extra == "dev"
44
+ Requires-Dist: transformers>=4.0.0; extra == "dev"
45
+ Requires-Dist: types-setuptools; extra == "dev"
46
+ Requires-Dist: types-requests; extra == "dev"
47
+ Requires-Dist: types-PyYAML; extra == "dev"
48
+ Requires-Dist: types-docker; extra == "dev"
49
+ Requires-Dist: versioneer>=0.20; extra == "dev"
50
+ Requires-Dist: openai==1.78.1; extra == "dev"
51
+ Requires-Dist: pre-commit; extra == "dev"
52
+ Requires-Dist: e2b; extra == "dev"
53
+ Requires-Dist: pytest-cov; extra == "dev"
54
+ Requires-Dist: pytest-xdist; extra == "dev"
55
+ Requires-Dist: docker==7.1.0; extra == "dev"
56
+ Requires-Dist: ipykernel>=6.30.0; extra == "dev"
57
+ Requires-Dist: jupyter>=1.1.1; extra == "dev"
58
+ Requires-Dist: pip>=25.1.1; extra == "dev"
59
+ Provides-Extra: trl
60
+ Requires-Dist: torch>=1.9; extra == "trl"
61
+ Requires-Dist: trl>=0.7.0; extra == "trl"
62
+ Requires-Dist: peft>=0.7.0; extra == "trl"
63
+ Requires-Dist: transformers>=4.0.0; extra == "trl"
64
+ Requires-Dist: accelerate>=0.28.0; extra == "trl"
65
+ Provides-Extra: openevals
66
+ Requires-Dist: openevals>=0.1.0; extra == "openevals"
67
+ Provides-Extra: fireworks
68
+ Requires-Dist: fireworks-ai>=0.19.10; extra == "fireworks"
69
+ Dynamic: license-file
70
+
71
+ # Reward Protocol
72
+
73
+ **Reward-Protocol: Author, reproduce, and evaluate reward functions seamlessly on Fireworks, TRL, and your own infrastructure.**
74
+
75
+ ## Key Features
76
+
77
+ * **Easy-to-use Decorator**: Define reward functions with a simple `@reward_function` decorator.
78
+ * **Local Testing**: Quickly test your reward functions with sample data.
79
+ * **Flexible Evaluation**: Evaluate model outputs based on single or multiple custom metrics.
80
+ * **Seamless Deployment**: Deploy your reward functions to platforms like Fireworks AI.
81
+ * **Comprehensive CLI**: Manage reward functions, preview evaluations (`eval-protocol preview`), deploy (`eval-protocol deploy`), and run complex evaluation pipelines (`eval-protocol run`).
82
+ * **Simplified Dataset Integration**: Direct integration with HuggingFace datasets and on-the-fly format conversion.
83
+ * **Extensible**: Designed to be adaptable for various LLM evaluation scenarios.
84
+
85
+ ## Installation
86
+
87
+ ```bash
88
+ pip install eval-protocol
89
+ ```
90
+
91
+ ### Optional TRL Extras
92
+
93
+ Install the additional dependencies required for running the TRL-based training
94
+ examples:
95
+
96
+ ```bash
97
+ pip install "eval-protocol[trl]"
98
+ ```
99
+
100
+ ## Getting Started
101
+
102
+ Reward Protocol simplifies the creation and deployment of reward functions for evaluating AI model outputs.
103
+
104
+ ### 1. Creating a Reward Function for Tool Calling
105
+
106
+ Reward Protocol allows you to define custom logic to evaluate model responses. Here's an example of how you might use the built-in `exact_tool_match_reward` for evaluating tool/function calls. This reward function checks if the model's generated tool calls exactly match the expected ones.
107
+
108
+ ```python
109
+ # This is a conceptual example of how exact_tool_match_reward is defined and used.
110
+ # You would typically import it from eval_protocol.rewards.function_calling.
111
+ # For actual usage, you configure it in your YAML files for `eval-protocol run`.
112
+
113
+ import json
+ from eval_protocol import reward_function
114
+ from eval_protocol.models import EvaluateResult, Message, MetricResult
115
+ from typing import List, Dict, Any, Optional, Union
116
+
117
+ # Definition of exact_tool_match_reward (simplified for brevity, see source for full details)
118
+ # from eval_protocol.rewards.function_calling import exact_tool_match_reward, eval_tool_call
119
+
120
+ @reward_function
121
+ def exact_tool_match_reward(
122
+ messages: Union[List[Message], List[Dict[str, Any]]],
123
+ ground_truth: Optional[Dict[str, Any]] = None,
124
+ **kwargs,
125
+ ) -> EvaluateResult:
126
+ if not messages:
127
+ return EvaluateResult(
128
+ score=0.0, reason="No messages provided for evaluation.", metrics={}
129
+ )
130
+
131
+ generation_message_obj = messages[-1]
132
+ generation_dict: Dict[str, Any]
133
+
134
+ if isinstance(generation_message_obj, Message):
135
+ generation_dict = {
136
+ "role": generation_message_obj.role,
137
+ "content": generation_message_obj.content,
138
+ }
139
+ if generation_message_obj.tool_calls:
140
+ generation_dict["tool_calls"] = [
141
+ tc.model_dump() if hasattr(tc, "model_dump") else tc
142
+ for tc in generation_message_obj.tool_calls
143
+ ]
144
+ elif isinstance(generation_message_obj, dict):
145
+ generation_dict = generation_message_obj
146
+ else:
147
+ # Handle error for unexpected type
148
+ return EvaluateResult(score=0.0, reason="Unexpected generation message type.", metrics={})
149
+
150
+ if ground_truth is None:
151
+ # Handle missing ground truth (e.g., score 0 if generation has tool calls, 1 if not)
152
+ # This logic is simplified here.
153
+ has_gen_tc = bool(generation_dict.get("tool_calls") or "<tool_call>" in generation_dict.get("content", ""))
154
+ score = 0.0 if has_gen_tc else 1.0
155
+ return EvaluateResult(score=score, reason="Ground truth not provided.", metrics={})
156
+
157
+ # Ensure ground_truth is a dict (it might be a JSON string from some datasets)
158
+ if isinstance(ground_truth, str):
159
+ try:
160
+ ground_truth = json.loads(ground_truth)
161
+ except json.JSONDecodeError:
162
+ return EvaluateResult(score=0.0, reason="Ground truth string failed to parse.", metrics={})
163
+
164
+ if not isinstance(ground_truth, dict):
165
+ return EvaluateResult(score=0.0, reason="Ground truth is not a dictionary.", metrics={})
166
+
167
+ # This simplified check compares generated tool calls with the expected ones.
168
+ expected_tcs = ground_truth.get("tool_calls", [])
169
+ generated_tcs = generation_dict.get("tool_calls", [])
170
+
171
+ # This is a highly simplified check. The actual function is much more robust.
172
+ is_match = (len(expected_tcs) == len(generated_tcs)) # Placeholder
173
+ score = 1.0 if is_match else 0.0
174
+
175
+ reason = f"Exact tool match evaluation score: {score}"
176
+ return EvaluateResult(score=score, reason=reason, metrics={
177
+ "tool_call_match": MetricResult(score=score, success=is_match, reason=reason)
178
+ })
179
+
180
+ ```
181
+ This example illustrates the structure. The actual `exact_tool_match_reward` in `eval_protocol.rewards.function_calling` handles complex parsing and comparison of tool calls.
182
+
183
+ ### 2. Testing Your Reward Function with a Dataset
184
+
185
+ Effective testing of a reward function involves evaluating it against a representative dataset. The key is the **dataset/reward function pair**: your dataset should provide the necessary `ground_truth` information that your reward function expects.
186
+
187
+ **Crafting Your Dataset:**
188
+
189
+ 1. **Define `ground_truth`**: For each sample in your dataset, the `ground_truth_for_eval` (or a similarly named field specified in your dataset configuration) must contain the information your reward function needs to make a judgment.
190
+ * For `exact_tool_match_reward`, `ground_truth` should be a dictionary, often with a `tool_calls` key. This key would hold a list of expected tool calls, each specifying the `name` and `arguments` of the function call. Example:
191
+ ```json
192
+ {
193
+ "role": "assistant",
194
+ "tool_calls": [
195
+ {
196
+ "name": "get_weather",
197
+ "arguments": {"location": "San Francisco, CA", "unit": "celsius"}
198
+ }
199
+ ]
200
+ }
201
+ ```
202
+ 2. **Format**: Datasets are typically JSONL files, where each line is a JSON object representing a sample. Each sample should include:
203
+ * `messages`: The input conversation history for the model.
204
+ * `tools` (optional, for tool calling): A list of available tools the model can use.
205
+ * `ground_truth_for_eval`: The expected output or data for the reward function (e.g., the structure shown above for tool calling).
206
+ * An `id` for tracking.
207
+
208
+ **Example Test Snippet (Conceptual):**
209
+
210
+ While `eval-protocol run` is the primary way to evaluate with datasets, here's a conceptual local test:
211
+
212
+ ```python
213
+ from eval_protocol.rewards.function_calling import exact_tool_match_reward # Import the actual function
214
+ from eval_protocol.models import Message
215
+
216
+ # Sample 1: Correct tool call
217
+ test_messages_correct = [
218
+ Message(role="user", content="What's the weather in SF?"),
219
+ Message(role="assistant", tool_calls=[ # Model's generated tool call
220
+ {"id": "call_123", "type": "function", "function": {"name": "get_weather", "arguments": '{"location": "San Francisco, CA", "unit": "celsius"}'}}
221
+ ])
222
+ ]
223
+ ground_truth_correct = { # Expected tool call for the reward function
224
+ "tool_calls": [
225
+ {"name": "get_weather", "arguments": {"location": "San Francisco, CA", "unit": "celsius"}}
226
+ ]
227
+ }
228
+
229
+ # Sample 2: Incorrect tool call
230
+ test_messages_incorrect = [
231
+ Message(role="user", content="What's the weather in SF?"),
232
+ Message(role="assistant", tool_calls=[
233
+ {"id": "call_456", "type": "function", "function": {"name": "get_current_time", "arguments": '{}'}}
234
+ ])
235
+ ]
236
+ # Ground truth remains the same as we expect get_weather
237
+
238
+ # Test with the actual reward function
239
+ result_correct = exact_tool_match_reward(messages=test_messages_correct, ground_truth=ground_truth_correct)
240
+ print(f"Correct Call - Score: {result_correct.score}, Reason: {result_correct.reason}")
241
+
242
+ result_incorrect = exact_tool_match_reward(messages=test_messages_incorrect, ground_truth=ground_truth_correct)
243
+ print(f"Incorrect Call - Score: {result_incorrect.score}, Reason: {result_incorrect.reason}")
244
+ ```
245
+ This local test helps verify the reward function's logic with specific inputs. For comprehensive evaluation, use `eval-protocol run` with a full dataset (see next section).
246
+
247
+ ### 3. Running Local Evaluations with `eval-protocol run`
248
+
249
+ For comprehensive local evaluations, especially when working with datasets and complex configurations, the `eval-protocol run` command is the recommended tool. It leverages Hydra for configuration management, allowing you to define your evaluation pipeline (dataset, model, reward function, etc.) in YAML files.
250
+
251
+ **Example: Math Evaluation using the GSM8K dataset**
252
+
253
+ The `examples/math_example` demonstrates evaluating models on math word problems.
254
+
255
+ ```bash
256
+ # Ensure you are in the repository root
257
+ # cd /path/to/eval-protocol
258
+
259
+ # Run evaluation with the math configuration
260
+ eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
261
+
262
+ # Override parameters directly from the command line:
263
+ eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf \
264
+ generation.model_name="accounts/fireworks/models/llama-v3p1-405b-instruct" \
265
+ evaluation_params.limit_samples=10
266
+ ```
267
+
268
+ **What this command does (typically):**
269
+ * Loads the specified dataset (e.g., GSM8K directly from HuggingFace).
270
+ * Applies any dataset-specific prompts or preprocessing defined in the configuration.
271
+ * Generates model responses (e.g., using the Fireworks API or other configured providers).
272
+ * Evaluates the generated responses using the specified reward function(s).
273
+ * Saves detailed evaluation results to `<config_output_name>.jsonl` (e.g., `math_example_results.jsonl`) in a timestamped output directory (e.g., under `outputs/`).
274
+ * Saves generated prompt/response pairs to `preview_input_output_pairs.jsonl` in the same output directory, suitable for inspection or re-evaluation with `eval-protocol preview`.
275
+
276
+ **Example: APPS Coding Evaluation**
277
+
278
+ The `examples/apps_coding_example` shows evaluation on code generation tasks using the `codeparrot/apps` dataset.
279
+
280
+ ```bash
281
+ # Run evaluation with the APPS coding configuration
282
+ eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval
283
+
284
+ # Example: Limit samples for a quick test
285
+ eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval evaluation_params.limit_samples=2
286
+
287
+ # Example: Disable generation to test reward function on cached responses
288
+ eval-protocol run --config-path examples/apps_coding_example/conf --config-name run_eval generation.enabled=false
289
+ ```
290
+
291
+ These examples showcase how `eval-protocol run` can be adapted for different tasks and datasets through configuration files.
292
+
293
+ For more details on this command, Hydra configuration, and advanced usage, see the [CLI Overview](docs/cli_reference/cli_overview.mdx) and [Hydra Configuration Guide](docs/developer_guide/hydra_configuration.mdx).
294
+
295
+ ### Fireworks Authentication Setup (Required for Preview/Deploy with Fireworks)
296
+
297
+ Reward Protocol needs your Fireworks AI credentials to deploy and manage evaluations on the Fireworks AI platform. The credentials are also needed for preview scenarios that use remote evaluators and whenever `eval-protocol run` uses a Fireworks-hosted model. You can configure these in two ways:
298
+
299
+ **A. Environment Variables (Highest Priority)**
300
+
301
+ Set the following environment variables:
302
+
303
+ * `FIREWORKS_API_KEY`: Your Fireworks AI API key. This is required for all interactions with the Fireworks API.
304
+ * `FIREWORKS_ACCOUNT_ID`: Your Fireworks AI Account ID. This is often required for operations like creating or listing evaluators under your account.
305
+
306
+ ```bash
307
+ export FIREWORKS_API_KEY="your_fireworks_api_key"
308
+ export FIREWORKS_ACCOUNT_ID="your_fireworks_account_id"
309
+ ```
310
+
311
+ **B. Configuration File (Lower Priority)**
312
+
313
+ Alternatively, you can store your credentials in a configuration file located at `~/.fireworks/auth.ini`. If environment variables are not set, Reward Protocol will look for this file.
314
+
315
+ Create the file with the following format:
316
+
317
+ ```ini
318
+ [fireworks]
319
+ api_key = YOUR_FIREWORKS_API_KEY
320
+ account_id = YOUR_FIREWORKS_ACCOUNT_ID
321
+ ```
322
+
323
+ Replace `YOUR_FIREWORKS_API_KEY` and `YOUR_FIREWORKS_ACCOUNT_ID` with your actual credentials.
324
+
325
+ **Credential Sourcing Order:**
326
+
327
+ Reward Protocol will prioritize credentials in the following order:
328
+ 1. Environment Variables (`FIREWORKS_API_KEY`, `FIREWORKS_ACCOUNT_ID`)
329
+ 2. `~/.fireworks/auth.ini` configuration file
330
+
331
+ Ensure that the `auth.ini` file has restrictive permissions so that only your user can read it (for example, `chmod 600 ~/.fireworks/auth.ini`), since it contains sensitive credentials.
332
+
333
+ The `FIREWORKS_API_KEY` is essential for authenticating your requests to the Fireworks AI service. The `FIREWORKS_ACCOUNT_ID` is used to identify your specific account context for operations that are account-specific, such as managing your evaluators. While the API key authenticates *who* you are, the account ID often specifies *where* (under which account) an operation should take place. Some Fireworks API endpoints may require both.
334
+
335
+ ### 4. Evaluating with Sample Data (Preview)
336
+
337
+ Create a JSONL file with sample conversations to evaluate:
338
+
339
+ ```json
340
+ {"messages": [{"role": "user", "content": "Tell me about AI"}, {"role": "assistant", "content": "AI refers to systems designed to mimic human intelligence."}]}
341
+ {"messages": [{"role": "user", "content": "What is machine learning?"}, {"role": "assistant", "content": "Machine learning is a subset of AI that focuses on building systems that can learn from data."}]}
342
+ ```
343
+
344
+ Preview your evaluation using the CLI:
345
+
346
+ ```bash
347
+ eval-protocol preview --metrics-folders "word_count=./path/to/metrics" --samples ./path/to/samples.jsonl
348
+ ```
349
+
350
+ For example:
351
+ ```bash
352
+ eval-protocol preview --metrics-folders "word_count=examples/metrics/word_count" --samples development/CODING_DATASET.jsonl
353
+ ```
354
+
355
+ ### 5. Deploying Your Reward Function
356
+
357
+ Deploy your reward function to use in training workflows:
358
+
359
+ ```bash
360
+ eval-protocol deploy --id my-evaluator --metrics-folders "word_count=./path/to/metrics" --force
361
+ ```
362
+
363
+ #### Local Development Server
364
+
365
+ For local development and testing, you can deploy a reward function as a local server with external tunnel access:
366
+
367
+ ```bash
368
+ # Deploy as local server with automatic tunnel (ngrok/serveo)
369
+ eval-protocol deploy --id test-local-serve-eval --target local-serve --function-ref examples.row_wise.dummy_example.dummy_rewards.simple_echo_reward --verbose --force
370
+ ```
371
+
372
+ **What this does:**
373
+ - Starts a local HTTP server on port 8001 serving your reward function
374
+ - Creates an external tunnel (using ngrok or serveo.net) to make the server publicly accessible
375
+ - Registers the tunnel URL with Fireworks AI for remote evaluation
376
+ - Keeps the server running indefinitely in the background
377
+
378
+ **Key points:**
379
+ - The CLI returns to the prompt after deployment, but the server continues running in the background
380
+ - Check running processes: `ps aux | grep -E "(generic_server|ngrok)"`
381
+ - Test locally: `curl -X POST http://localhost:8001/evaluate -H "Content-Type: application/json" -d '{"messages": [{"role": "user", "content": "test"}]}'`
382
+ - Monitor logs: `tail -f logs/eval-protocol-local/generic_server_*.log`
383
+ - Stop server: Kill the background processes manually when done
384
+
385
+ This is ideal for development, testing webhook integrations, or accessing your reward function from remote services without full cloud deployment.
386
+
387
+ Or deploy programmatically:
388
+
389
+ ```python
390
+ from eval_protocol.evaluation import create_evaluation
391
+
392
+ evaluator = create_evaluation(
393
+ evaluator_id="my-evaluator",
394
+ metric_folders=["word_count=./path/to/metrics"],
395
+ display_name="My Word Count Evaluator",
396
+ description="Evaluates responses based on word count",
397
+ force=True # Update if already exists
398
+ )
399
+ ```
400
+
401
+ ## Advanced Usage
402
+
403
+ ### Multiple Metrics
404
+
405
+ Combine multiple metrics in a single reward function:
406
+
407
+ ```python
408
+ from eval_protocol import reward_function
409
+ from eval_protocol.models import EvaluateResult, MetricResult, Message # Assuming models are here
410
+ from typing import List, Dict, Any, Optional
411
+
412
+ @reward_function
413
+ def combined_reward(
414
+ messages: List[Dict[str, Any]], # Or List[Message]
415
+ original_messages: Optional[List[Dict[str, Any]]] = None, # Or List[Message]
416
+ **kwargs: Any
417
+ ) -> EvaluateResult:
418
+ """Evaluate with multiple metrics."""
419
+ response = messages[-1].get("content", "")
420
+
421
+ # Word count metric
422
+ word_count = len(response.split())
423
+ word_score = min(word_count / 100.0, 1.0)
424
+ word_metric_success = word_count > 10
425
+
426
+ # Specificity metric
427
+ specificity_markers = ["specifically", "for example", "such as"]
428
+ marker_count = sum(1 for marker in specificity_markers if marker.lower() in response.lower())
429
+ specificity_score = min(marker_count / 2.0, 1.0)
430
+ specificity_metric_success = marker_count > 0
431
+
432
+ # Combined score with weighted components
433
+ final_score = word_score * 0.3 + specificity_score * 0.7
434
+
435
+ return EvaluateResult(
436
+ score=final_score,
437
+ reason=f"Combined score based on word count ({word_count}) and specificity markers ({marker_count})",
438
+ metrics={
439
+ "word_count": MetricResult(
440
+ score=word_score,
441
+ success=word_metric_success,
442
+ reason=f"Word count: {word_count}"
443
+ ),
444
+ "specificity": MetricResult(
445
+ score=specificity_score,
446
+ success=specificity_metric_success,
447
+ reason=f"Found {marker_count} specificity markers"
448
+ )
449
+ }
450
+ )
451
+ ```
452
+
453
+ ### Custom Model Providers
454
+
455
+ Deploy your reward function with a specific model provider:
456
+
457
+ ```python
458
+ # Deploy with a custom provider
459
+ my_function.deploy(
460
+ name="my-evaluator-anthropic",
461
+ description="My evaluator using Claude model",
462
+ providers=[
463
+ {
464
+ "providerType": "anthropic",
465
+ "modelId": "claude-3-sonnet-20240229"
466
+ }
467
+ ],
468
+ force=True
469
+ )
470
+ ```
471
+
472
+ ## Dataset Integration
473
+
474
+ Reward Protocol provides seamless integration with popular datasets through a simplified configuration system:
475
+
476
+ ### Direct HuggingFace Integration
477
+
478
+ Load datasets directly from HuggingFace Hub without manual preprocessing:
479
+
480
+ ```bash
481
+ # Evaluate using GSM8K dataset with math-specific prompts
482
+ eval-protocol run --config-name run_math_eval.yaml --config-path examples/math_example/conf
483
+ ```
484
+
485
+ ### Derived Datasets
486
+
487
+ Create specialized dataset configurations that reference base datasets and apply transformations:
488
+
489
+ ```yaml
490
+ # conf/dataset/gsm8k_math_prompts.yaml
491
+ defaults:
492
+ - base_derived_dataset
493
+ - _self_
494
+
495
+ base_dataset: "gsm8k"
496
+ system_prompt: "Solve the following math problem. Show your work clearly. Put the final numerical answer between <answer> and </answer> tags."
497
+ output_format: "evaluation_format"
498
+ derived_max_samples: 5
499
+ ```
500
+
501
+ ### Key Benefits
502
+
503
+ - **No Manual Conversion**: Datasets are converted to evaluation format on-the-fly
504
+ - **System Prompt Integration**: Prompts are part of dataset configuration, not evaluation logic
505
+ - **Flexible Column Mapping**: Automatically adapts different dataset formats
506
+ - **Reusable Configurations**: Base datasets can be extended for different use cases
507
+
508
+ See the [math example](examples/math_example/) for a complete demonstration of the dataset system.
509
+
510
+ ## Detailed Documentation
511
+
512
+ For more comprehensive information, including API references, tutorials, and advanced guides, please see our [full documentation](docs/documentation_home.mdx).
513
+
514
+ ## Examples
515
+
516
+ Check the `examples` directory for complete examples:
517
+
518
+ - `evaluation_preview_example.py`: How to preview an evaluator.
519
+ - `deploy_example.py`: How to deploy a reward function to Fireworks.
520
+ - `math_example/`: Demonstrates CLI-based evaluation (`eval-protocol run`) and TRL GRPO training for math problems (GSM8K dataset).
521
+ - `apps_coding_example/`: Shows CLI-based evaluation (`eval-protocol run`) for code generation tasks (APPS dataset).
522
+
523
+
524
+ The OpenEvals project provides a suite of evaluators that can be used directly within Reward Protocol. The helper `eval_protocol.integrations.openeval.adapt` converts any OpenEvals evaluator into a reward function returning an `EvaluateResult`.
525
+
526
+ ```python
527
+ from openevals import exact_match
528
+ from eval_protocol.integrations.openeval import adapt
529
+
530
+ exact_match_reward = adapt(exact_match)
531
+ result = exact_match_reward(
532
+ messages=[{"role": "assistant", "content": "hello"}],
533
+ ground_truth="hello",
534
+ )
535
+ print(result.score)
536
+ ```
537
+
538
+ The [deepeval](https://github.com/confident-ai/deepeval) project also offers a
539
+ variety of metrics. The helper `eval_protocol.integrations.deepeval.adapt_metric`
540
+ converts a deepeval metric instance into a reward function returning an
541
+ `EvaluateResult`.
542
+
543
+ ```python
544
+ from deepeval.metrics import FaithfulnessMetric
545
+ from eval_protocol.integrations.deepeval import adapt_metric
546
+
547
+ faithfulness_reward = adapt_metric(FaithfulnessMetric())
548
+ result = faithfulness_reward(
549
+ messages=[{"role": "assistant", "content": "hello"}],
550
+ ground_truth="hello",
551
+ )
552
+ print(result.score)
553
+ ```
554
+
555
+ The GEval metric family uses an LLM-as-a-judge to score outputs based on
556
+ custom criteria. You can construct a `GEval` metric and adapt it in the same
557
+ way:
558
+
559
+ ```python
560
+ from deepeval.metrics import GEval
561
+ from deepeval.test_case import LLMTestCaseParams
562
+ from eval_protocol.integrations.deepeval import adapt_metric
563
+
564
+ correctness_metric = GEval(
565
+ name="Correctness",
566
+ criteria="Determine whether the answer is factually correct",
567
+ evaluation_params=[
568
+ LLMTestCaseParams.INPUT,
569
+ LLMTestCaseParams.ACTUAL_OUTPUT,
570
+ LLMTestCaseParams.EXPECTED_OUTPUT,
571
+ ],
572
+ )
573
+
574
+ correctness_reward = adapt_metric(correctness_metric)
575
+ result = correctness_reward(
576
+ messages=[{"role": "user", "content": "Who wrote 1984?"}, {"role": "assistant", "content": "George Orwell"}],
577
+ ground_truth="George Orwell",
578
+ )
579
+ print(result.score)
580
+ ```
581
+
582
+ ## Command Line Interface
583
+
584
+ Reward Protocol includes a CLI for common operations:
585
+
586
+ ```bash
587
+ # Show help
588
+ eval-protocol --help
589
+
590
+ # Preview an evaluator
591
+ eval-protocol preview --metrics-folders "metric=./path" --samples ./samples.jsonl
592
+
593
+ # Deploy an evaluator
594
+ eval-protocol deploy --id my-evaluator --metrics-folders "metric=./path" --force
595
+ ```
596
+
597
+ ## Community and Support
598
+
599
+ * **GitHub Issues**: For bug reports and feature requests, please use [GitHub Issues](https://github.com/eval-protocol/python-sdk/issues).
600
+ * **GitHub Discussions**: (If enabled) For general questions, ideas, and discussions.
601
+ * Please also review our [Contributing Guidelines](development/CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md).
602
+
603
+ ## Development
604
+
605
+ ### Type Checking
606
+
607
+ The codebase uses mypy for static type checking. To run type checking:
608
+
609
+ ```bash
610
+ # Install development dependencies
611
+ pip install -e ".[dev]"
612
+
613
+ # Run mypy
614
+ mypy eval_protocol
615
+ ```
616
+
617
+ Our CI pipeline enforces type checking, so please ensure your code passes mypy checks before submitting PRs.
618
+
619
+ ### Running Tests
620
+
621
+ ```bash
622
+ # Install test dependencies
623
+ pip install -e ".[dev]"
624
+
625
+ # Run tests
626
+ pytest
627
+ ```
628
+
629
+ ## Code of Conduct
630
+
631
+ We are dedicated to providing a welcoming and inclusive experience for everyone. Please review and adhere to our [Code of Conduct](CODE_OF_CONDUCT.md).
632
+
633
+ ## License
634
+
635
+ Reward Protocol is released under the Apache License 2.0.