freesolo 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. freesolo-0.2.2/README.md → freesolo-0.2.3/PKG-INFO +144 -28
  2. freesolo-0.2.2/PKG-INFO → freesolo-0.2.3/README.md +126 -42
  3. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/.env.example +0 -1
  4. freesolo-0.2.3/pypi/examples/evals/__init__.py +1 -0
  5. freesolo-0.2.3/pypi/examples/evals/exact_match.py +105 -0
  6. freesolo-0.2.3/pypi/examples/evals/llm_judge.py +149 -0
  7. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/__init__.py +4 -2
  8. freesolo-0.2.3/pypi/freesolo/contracts/__init__.py +23 -0
  9. freesolo-0.2.3/pypi/freesolo/contracts/markdown.py +80 -0
  10. freesolo-0.2.3/pypi/freesolo/contracts/types.py +30 -0
  11. freesolo-0.2.3/pypi/freesolo/environments/__init__.py +32 -0
  12. freesolo-0.2.3/pypi/freesolo/environments/base.py +235 -0
  13. freesolo-0.2.3/pypi/freesolo/environments/evaluation.py +321 -0
  14. freesolo-0.2.3/pypi/freesolo/environments/types.py +157 -0
  15. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/__init__.py +14 -2
  16. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/client.py +70 -6
  17. freesolo-0.2.3/pypi/freesolo/evaluation/judges/__init__.py +27 -0
  18. freesolo-0.2.3/pypi/freesolo/evaluation/judges/base.py +182 -0
  19. freesolo-0.2.3/pypi/freesolo/evaluation/judges/groundedness.py +34 -0
  20. freesolo-0.2.3/pypi/freesolo/evaluation/judges/instruction_following.py +31 -0
  21. freesolo-0.2.3/pypi/freesolo/evaluation/judges/pairwise_preference.py +45 -0
  22. freesolo-0.2.3/pypi/freesolo/evaluation/judges/reference_correctness.py +26 -0
  23. freesolo-0.2.3/pypi/freesolo/evaluation/judges/rubric.py +46 -0
  24. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/responses.py +16 -8
  25. freesolo-0.2.3/pypi/freesolo/evaluation/results.py +93 -0
  26. freesolo-0.2.3/pypi/freesolo/evaluation/types.py +16 -0
  27. freesolo-0.2.3/pypi/freesolo/gepa/__init__.py +47 -0
  28. freesolo-0.2.3/pypi/freesolo/gepa/adapter.py +223 -0
  29. freesolo-0.2.3/pypi/freesolo/gepa/reflection.py +88 -0
  30. freesolo-0.2.3/pypi/freesolo/gepa/setup.py +227 -0
  31. freesolo-0.2.3/pypi/freesolo/gepa/types.py +127 -0
  32. freesolo-0.2.3/pypi/freesolo/storage.py +198 -0
  33. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/__init__.py +4 -0
  34. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/client.py +65 -100
  35. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/decorators.py +8 -5
  36. freesolo-0.2.3/pypi/freesolo/tracing/types.py +79 -0
  37. freesolo-0.2.3/pypi/freesolo/training/__init__.py +14 -0
  38. freesolo-0.2.3/pypi/freesolo/training/grpo/__init__.py +20 -0
  39. freesolo-0.2.3/pypi/freesolo/training/grpo/config.py +44 -0
  40. freesolo-0.2.3/pypi/freesolo/training/grpo/datums.py +205 -0
  41. freesolo-0.2.3/pypi/freesolo/training/grpo/rewards.py +139 -0
  42. freesolo-0.2.3/pypi/freesolo/training/grpo/sampling.py +135 -0
  43. freesolo-0.2.3/pypi/freesolo/training/storage.py +57 -0
  44. freesolo-0.2.3/pypi/freesolo/training/train_grpo.py +433 -0
  45. freesolo-0.2.3/pypi/freesolo/training/train_sft.py +264 -0
  46. freesolo-0.2.3/pypi/freesolo/training/types.py +22 -0
  47. freesolo-0.2.3/pypi/freesolo/util.py +462 -0
  48. freesolo-0.2.3/pypi/freesolo/utils/__init__.py +119 -0
  49. freesolo-0.2.3/pypi/freesolo/utils/checkpoints.py +262 -0
  50. freesolo-0.2.3/pypi/freesolo/utils/deployment.py +70 -0
  51. freesolo-0.2.3/pypi/freesolo/utils/openrouter.py +217 -0
  52. freesolo-0.2.3/pypi/freesolo/utils/oracle.py +241 -0
  53. freesolo-0.2.3/pypi/freesolo/utils/wandb.py +344 -0
  54. {freesolo-0.2.2 → freesolo-0.2.3}/pyproject.toml +7 -2
  55. freesolo-0.2.3/tests/test_evaluation_client.py +160 -0
  56. freesolo-0.2.3/tests/test_gepa_adapter.py +95 -0
  57. freesolo-0.2.3/tests/test_storage_sync.py +324 -0
  58. freesolo-0.2.3/tests/test_utils_checkpoints.py +106 -0
  59. {freesolo-0.2.2 → freesolo-0.2.3}/uv.lock +302 -2
  60. freesolo-0.2.2/pypi/freesolo/evaluation/hosted.py +0 -404
  61. freesolo-0.2.2/pypi/freesolo/evaluation/judges.py +0 -27
  62. freesolo-0.2.2/pypi/freesolo/evaluation/results.py +0 -61
  63. freesolo-0.2.2/pypi/freesolo/evaluation/utils.py +0 -11
  64. freesolo-0.2.2/pypi/freesolo/sdk.py +0 -52
  65. freesolo-0.2.2/pypi/freesolo/tracing/utils.py +0 -15
  66. freesolo-0.2.2/pypi/freesolo/utils.py +0 -37
  67. {freesolo-0.2.2 → freesolo-0.2.3}/.env.example +0 -0
  68. {freesolo-0.2.2 → freesolo-0.2.3}/.gitignore +0 -0
  69. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/.gitignore +0 -0
  70. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/__init__.py +0 -0
  71. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/__init__.py +0 -0
  72. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/chat.py +0 -0
  73. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/vision.py +0 -0
  74. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/__init__.py +0 -0
  75. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/chat.py +0 -0
  76. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/vision.py +0 -0
  77. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/__init__.py +0 -0
  78. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/chat.py +0 -0
  79. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/vision.py +0 -0
  80. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openrouter/__init__.py +0 -0
  81. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openrouter/chat.py +0 -0
  82. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/utils.py +0 -0
  83. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/py.typed +0 -0
  84. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/__init__.py +0 -0
  85. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/anthropic.py +0 -0
  86. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/config.py +0 -0
  87. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/gemini.py +0 -0
  88. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/openai.py +0 -0
  89. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/utils.py +0 -0
  90. {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/sanitize.py +0 -0
  91. {freesolo-0.2.2 → freesolo-0.2.3}/ruff.toml +0 -0
@@ -1,3 +1,21 @@
1
+ Metadata-Version: 2.4
2
+ Name: freesolo
3
+ Version: 0.2.3
4
+ Summary: Tracing, evaluation, and training utilities for LLM applications.
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: httpx>=0.27.0
7
+ Requires-Dist: wandb>=0.17.0
8
+ Provides-Extra: dev
9
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
10
+ Requires-Dist: ruff>=0.11.0; extra == 'dev'
11
+ Provides-Extra: examples
12
+ Requires-Dist: anthropic>=0.40.0; extra == 'examples'
13
+ Requires-Dist: google-genai>=1.0.0; extra == 'examples'
14
+ Requires-Dist: openai>=1.0.0; extra == 'examples'
15
+ Provides-Extra: gepa
16
+ Requires-Dist: gepa>=0.1.1; extra == 'gepa'
17
+ Description-Content-Type: text/markdown
18
+
1
19
  # freesolo
2
20
 
3
21
  `freesolo` is a Python tracing and evaluation package for LLM apps.
@@ -7,7 +25,7 @@ It is built for the lowest-friction integration possible:
7
25
  1. Install the package
8
26
  2. Set `FREESOLO_API_KEY`
9
27
  3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
10
- 4. Run traces and evaluations from the same SDK
28
+ 4. Run traces and evaluations from the package APIs
11
29
 
12
30
  ## Current provider support
13
31
 
@@ -20,7 +38,7 @@ It is built for the lowest-friction integration possible:
20
38
 
21
39
  ## Install
22
40
 
23
- Install the package plus the provider SDK you use:
41
+ Install the package plus the provider client you use:
24
42
 
25
43
  ```bash
26
44
  pip install freesolo openai
@@ -154,7 +172,7 @@ with start_trace("support-agent-run"):
154
172
 
155
173
  ## Evaluations
156
174
 
157
- `freesolo` also includes a small evaluation SDK for CI jobs, GitHub bots, and
175
+ `freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
158
176
  eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
159
177
  `api_key`.
160
178
 
@@ -168,8 +186,7 @@ results with your API key. Pass scorer objects, not strings.
168
186
  ```python
169
187
  from typing import Any
170
188
 
171
- from freesolo import Freesolo
172
- from freesolo.evaluation import BinaryResponse, CustomScorer
189
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
173
190
 
174
191
 
175
192
  class ExactMatch(CustomScorer[BinaryResponse]):
@@ -182,9 +199,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
182
199
  )
183
200
 
184
201
 
185
- client = Freesolo()
202
+ client = EvaluationClient()
186
203
 
187
- results = client.evals.run(
204
+ results = client.run(
188
205
  name="support-agent-correctness",
189
206
  data=[
190
207
  {
@@ -199,13 +216,123 @@ results = client.evals.run(
199
216
  print(results[0].success)
200
217
  ```
201
218
 
219
+ ## Tinker Deployment
220
+
221
+ `freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
222
+ a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
223
+ the server JSON response.
224
+
225
+ ```python
226
+ from freesolo.utils.deployment import deploy_tinker_checkpoint
227
+
228
+ result = deploy_tinker_checkpoint(
229
+ "tinker://<run_id>/sampler_weights/final",
230
+ base_model="Qwen/Qwen3.5-35B-A3B",
231
+ )
232
+
233
+ print(result["repoId"])
234
+ ```
235
+
236
+ ### Environment-driven evaluations
237
+
238
+ For training contracts, you can use the same `Environment` adapter for evals,
239
+ SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
240
+ your model callback, scores the response through the environment, and uploads
241
+ the same `scorers_data` shape used by the eval DB.
242
+
243
+ ```python
244
+ from typing import Any
245
+
246
+ from openai import OpenAI
247
+
248
+ from freesolo.environments import (
249
+ Environment,
250
+ EnvironmentGeneration,
251
+ RewardMetric,
252
+ RewardResult,
253
+ TaskExample,
254
+ )
255
+ from freesolo.evaluation import EvaluationClient
256
+
257
+
258
+ class ContractEnvironment(Environment):
259
+ def build_prompt_messages(
260
+ self,
261
+ example: TaskExample,
262
+ contract_text: str,
263
+ ):
264
+ return [
265
+ {"role": "system", "content": contract_text},
266
+ {"role": "user", "content": example.task},
267
+ ]
268
+
269
+ def score_response(
270
+ self,
271
+ example: TaskExample,
272
+ response_text: str,
273
+ ) -> RewardResult:
274
+ passed = response_text.strip() == str(example.expected_output).strip()
275
+ return RewardResult(
276
+ name="exact_match",
277
+ score=1.0 if passed else 0.0,
278
+ success=passed,
279
+ threshold=1.0,
280
+ reason="matched expected output" if passed else "mismatch",
281
+ return_type="binary",
282
+ metrics=(
283
+ RewardMetric(
284
+ name="canonical_match",
285
+ score=1.0 if passed else 0.0,
286
+ success=passed,
287
+ threshold=1.0,
288
+ ),
289
+ ),
290
+ )
291
+
292
+
293
+ model = OpenAI()
294
+
295
+
296
+ def generate(messages: list[dict[str, str]], example: TaskExample):
297
+ response = model.chat.completions.create(
298
+ model="gpt-4.1-mini",
299
+ messages=messages,
300
+ )
301
+ return EnvironmentGeneration(
302
+ response_text=response.choices[0].message.content or "",
303
+ total_tokens=response.usage.total_tokens if response.usage else None,
304
+ )
305
+
306
+
307
+ results = EvaluationClient().run_environment(
308
+ name="contract-eval",
309
+ source="eval.jsonl",
310
+ contract_path="TRAINING_CONTRACT.md",
311
+ environment=ContractEnvironment(),
312
+ generate=generate,
313
+ )
314
+ ```
315
+
316
+ `RewardResult` is the top-level scorer entry stored in
317
+ `eval_tasks.scorers_data`. Its fields are:
318
+
319
+ - `name`: scorer name shown in the UI.
320
+ - `score`: numeric reward value.
321
+ - `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
322
+ from whether `score > 0`.
323
+ - `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
324
+ pass/fail context.
325
+ - `latency_ms`, `total_tokens`: optional per-response usage metadata.
326
+ - `metadata`: JSON object for scorer-specific details.
327
+ - `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
328
+ `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
329
+
202
330
  Custom scorer:
203
331
 
204
332
  ```python
205
333
  from typing import Any
206
334
 
207
- from freesolo import Freesolo
208
- from freesolo.evaluation import BinaryResponse, CustomScorer
335
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
209
336
 
210
337
 
211
338
  class NoEmptyAnswer(CustomScorer[BinaryResponse]):
@@ -214,7 +341,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
214
341
  return BinaryResponse(value=ok, reason="actual_output is non-empty")
215
342
 
216
343
 
217
- results = Freesolo().evals.run(
344
+ results = EvaluationClient().run(
218
345
  name="support-agent-non-empty",
219
346
  data=[{"actual_output": "hello"}],
220
347
  scorers=[NoEmptyAnswer()],
@@ -232,8 +359,8 @@ from typing import Any
232
359
 
233
360
  from openai import OpenAI
234
361
 
235
- from freesolo import Freesolo, instrument_openai
236
- from freesolo.evaluation import CustomScorer, NumericResponse
362
+ from freesolo import instrument_openai
363
+ from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
237
364
 
238
365
 
239
366
  class CorrectnessJudge(CustomScorer[NumericResponse]):
@@ -278,7 +405,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
278
405
 
279
406
  judge_client = instrument_openai(OpenAI())
280
407
 
281
- results = Freesolo().evals.run(
408
+ results = EvaluationClient().run(
282
409
  name="support-agent-correctness",
283
410
  data=[
284
411
  {
@@ -302,27 +429,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
302
429
  ```python
303
430
  from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
304
431
 
305
- judge = HostedJudgeClient(
306
- api_key="YOUR_OPENROUTER_API_KEY",
307
- model="openai/gpt-oss-120b",
308
- )
432
+ judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
309
433
 
310
434
  scorer = ReferenceCorrectnessScorer(client=judge)
311
435
  ```
312
436
 
313
- Tracing is available from the same root client:
437
+ Tracing is available through namespaced helpers:
314
438
 
315
439
  ```python
316
- from freesolo import Freesolo
440
+ from freesolo.tracing import start_trace
317
441
 
318
- client = Freesolo()
319
-
320
- with client.traces.start("support-agent-run"):
442
+ with start_trace("support-agent-run"):
321
443
  ...
322
444
  ```
323
-
324
- You can also import namespaced tracing helpers directly:
325
-
326
- ```python
327
- from freesolo.tracing import start_trace, wrap
328
- ```
@@ -1,17 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: freesolo
3
- Version: 0.2.2
4
- Summary: Tracing and evaluation SDK for LLM applications.
5
- Requires-Python: >=3.10
6
- Requires-Dist: httpx>=0.27.0
7
- Provides-Extra: dev
8
- Requires-Dist: ruff>=0.11.0; extra == 'dev'
9
- Provides-Extra: examples
10
- Requires-Dist: anthropic>=0.40.0; extra == 'examples'
11
- Requires-Dist: google-genai>=1.0.0; extra == 'examples'
12
- Requires-Dist: openai>=1.0.0; extra == 'examples'
13
- Description-Content-Type: text/markdown
14
-
15
1
  # freesolo
16
2
 
17
3
  `freesolo` is a Python tracing and evaluation package for LLM apps.
@@ -21,7 +7,7 @@ It is built for the lowest-friction integration possible:
21
7
  1. Install the package
22
8
  2. Set `FREESOLO_API_KEY`
23
9
  3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
24
- 4. Run traces and evaluations from the same SDK
10
+ 4. Run traces and evaluations from the package APIs
25
11
 
26
12
  ## Current provider support
27
13
 
@@ -34,7 +20,7 @@ It is built for the lowest-friction integration possible:
34
20
 
35
21
  ## Install
36
22
 
37
- Install the package plus the provider SDK you use:
23
+ Install the package plus the provider client you use:
38
24
 
39
25
  ```bash
40
26
  pip install freesolo openai
@@ -168,7 +154,7 @@ with start_trace("support-agent-run"):
168
154
 
169
155
  ## Evaluations
170
156
 
171
- `freesolo` also includes a small evaluation SDK for CI jobs, GitHub bots, and
157
+ `freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
172
158
  eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
173
159
  `api_key`.
174
160
 
@@ -182,8 +168,7 @@ results with your API key. Pass scorer objects, not strings.
182
168
  ```python
183
169
  from typing import Any
184
170
 
185
- from freesolo import Freesolo
186
- from freesolo.evaluation import BinaryResponse, CustomScorer
171
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
187
172
 
188
173
 
189
174
  class ExactMatch(CustomScorer[BinaryResponse]):
@@ -196,9 +181,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
196
181
  )
197
182
 
198
183
 
199
- client = Freesolo()
184
+ client = EvaluationClient()
200
185
 
201
- results = client.evals.run(
186
+ results = client.run(
202
187
  name="support-agent-correctness",
203
188
  data=[
204
189
  {
@@ -213,13 +198,123 @@ results = client.evals.run(
213
198
  print(results[0].success)
214
199
  ```
215
200
 
201
+ ## Tinker Deployment
202
+
203
+ `freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
204
+ a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
205
+ the server JSON response.
206
+
207
+ ```python
208
+ from freesolo.utils.deployment import deploy_tinker_checkpoint
209
+
210
+ result = deploy_tinker_checkpoint(
211
+ "tinker://<run_id>/sampler_weights/final",
212
+ base_model="Qwen/Qwen3.5-35B-A3B",
213
+ )
214
+
215
+ print(result["repoId"])
216
+ ```
217
+
218
+ ### Environment-driven evaluations
219
+
220
+ For training contracts, you can use the same `Environment` adapter for evals,
221
+ SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
222
+ your model callback, scores the response through the environment, and uploads
223
+ the same `scorers_data` shape used by the eval DB.
224
+
225
+ ```python
226
+ from typing import Any
227
+
228
+ from openai import OpenAI
229
+
230
+ from freesolo.environments import (
231
+ Environment,
232
+ EnvironmentGeneration,
233
+ RewardMetric,
234
+ RewardResult,
235
+ TaskExample,
236
+ )
237
+ from freesolo.evaluation import EvaluationClient
238
+
239
+
240
+ class ContractEnvironment(Environment):
241
+ def build_prompt_messages(
242
+ self,
243
+ example: TaskExample,
244
+ contract_text: str,
245
+ ):
246
+ return [
247
+ {"role": "system", "content": contract_text},
248
+ {"role": "user", "content": example.task},
249
+ ]
250
+
251
+ def score_response(
252
+ self,
253
+ example: TaskExample,
254
+ response_text: str,
255
+ ) -> RewardResult:
256
+ passed = response_text.strip() == str(example.expected_output).strip()
257
+ return RewardResult(
258
+ name="exact_match",
259
+ score=1.0 if passed else 0.0,
260
+ success=passed,
261
+ threshold=1.0,
262
+ reason="matched expected output" if passed else "mismatch",
263
+ return_type="binary",
264
+ metrics=(
265
+ RewardMetric(
266
+ name="canonical_match",
267
+ score=1.0 if passed else 0.0,
268
+ success=passed,
269
+ threshold=1.0,
270
+ ),
271
+ ),
272
+ )
273
+
274
+
275
+ model = OpenAI()
276
+
277
+
278
+ def generate(messages: list[dict[str, str]], example: TaskExample):
279
+ response = model.chat.completions.create(
280
+ model="gpt-4.1-mini",
281
+ messages=messages,
282
+ )
283
+ return EnvironmentGeneration(
284
+ response_text=response.choices[0].message.content or "",
285
+ total_tokens=response.usage.total_tokens if response.usage else None,
286
+ )
287
+
288
+
289
+ results = EvaluationClient().run_environment(
290
+ name="contract-eval",
291
+ source="eval.jsonl",
292
+ contract_path="TRAINING_CONTRACT.md",
293
+ environment=ContractEnvironment(),
294
+ generate=generate,
295
+ )
296
+ ```
297
+
298
+ `RewardResult` is the top-level scorer entry stored in
299
+ `eval_tasks.scorers_data`. Its fields are:
300
+
301
+ - `name`: scorer name shown in the UI.
302
+ - `score`: numeric reward value.
303
+ - `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
304
+ from whether `score > 0`.
305
+ - `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
306
+ pass/fail context.
307
+ - `latency_ms`, `total_tokens`: optional per-response usage metadata.
308
+ - `metadata`: JSON object for scorer-specific details.
309
+ - `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
310
+ `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
311
+
216
312
  Custom scorer:
217
313
 
218
314
  ```python
219
315
  from typing import Any
220
316
 
221
- from freesolo import Freesolo
222
- from freesolo.evaluation import BinaryResponse, CustomScorer
317
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
223
318
 
224
319
 
225
320
  class NoEmptyAnswer(CustomScorer[BinaryResponse]):
@@ -228,7 +323,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
228
323
  return BinaryResponse(value=ok, reason="actual_output is non-empty")
229
324
 
230
325
 
231
- results = Freesolo().evals.run(
326
+ results = EvaluationClient().run(
232
327
  name="support-agent-non-empty",
233
328
  data=[{"actual_output": "hello"}],
234
329
  scorers=[NoEmptyAnswer()],
@@ -246,8 +341,8 @@ from typing import Any
246
341
 
247
342
  from openai import OpenAI
248
343
 
249
- from freesolo import Freesolo, instrument_openai
250
- from freesolo.evaluation import CustomScorer, NumericResponse
344
+ from freesolo import instrument_openai
345
+ from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
251
346
 
252
347
 
253
348
  class CorrectnessJudge(CustomScorer[NumericResponse]):
@@ -292,7 +387,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
292
387
 
293
388
  judge_client = instrument_openai(OpenAI())
294
389
 
295
- results = Freesolo().evals.run(
390
+ results = EvaluationClient().run(
296
391
  name="support-agent-correctness",
297
392
  data=[
298
393
  {
@@ -316,27 +411,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
316
411
  ```python
317
412
  from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
318
413
 
319
- judge = HostedJudgeClient(
320
- api_key="YOUR_OPENROUTER_API_KEY",
321
- model="openai/gpt-oss-120b",
322
- )
414
+ judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
323
415
 
324
416
  scorer = ReferenceCorrectnessScorer(client=judge)
325
417
  ```
326
418
 
327
- Tracing is available from the same root client:
419
+ Tracing is available through namespaced helpers:
328
420
 
329
421
  ```python
330
- from freesolo import Freesolo
422
+ from freesolo.tracing import start_trace
331
423
 
332
- client = Freesolo()
333
-
334
- with client.traces.start("support-agent-run"):
424
+ with start_trace("support-agent-run"):
335
425
  ...
336
426
  ```
337
-
338
- You can also import namespaced tracing helpers directly:
339
-
340
- ```python
341
- from freesolo.tracing import start_trace, wrap
342
- ```
@@ -1,5 +1,4 @@
1
1
  OPENAI_API_KEY=
2
- FREESOLO_JUDGE_MODEL=gpt-4.1-mini
3
2
  OPENROUTER_API_KEY=
4
3
 
5
4
  ANTHROPIC_API_KEY=
@@ -0,0 +1 @@
1
+ """Evaluation examples for the Python package."""
@@ -0,0 +1,105 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from typing import Any
5
+
6
+ from freesolo import BinaryResponse, CustomScorer
7
+ from freesolo.evaluation import EvaluationClient
8
+
9
+ from ..utils import configure_example
10
+
11
+
12
+ class ExactMatchScorer(CustomScorer[BinaryResponse]):
13
+ name = "exact_match"
14
+
15
+ async def score(self, row: dict[str, Any]) -> BinaryResponse:
16
+ actual = str(row.get("actual_output", "")).strip()
17
+ expected = str(row.get("expected_output", "")).strip()
18
+ success = bool(actual) and actual == expected
19
+ return BinaryResponse(
20
+ success,
21
+ reason=(
22
+ "actual_output matched expected_output"
23
+ if success
24
+ else f'expected "{expected}" but got "{actual or "<empty>"}"'
25
+ ),
26
+ )
27
+
28
+
29
+ class NonEmptyOutputScorer(CustomScorer[BinaryResponse]):
30
+ name = "non_empty_output"
31
+
32
+ async def score(self, row: dict[str, Any]) -> BinaryResponse:
33
+ actual = str(row.get("actual_output", "")).strip()
34
+ return BinaryResponse(
35
+ bool(actual),
36
+ reason=(
37
+ "actual_output is non-empty" if actual else "actual_output was empty"
38
+ ),
39
+ )
40
+
41
+
42
+ def parse_args() -> argparse.Namespace:
43
+ parser = argparse.ArgumentParser(description="Exact-match eval example.")
44
+ parser.add_argument("name", nargs="?", default="eval-example-exact-match-py")
45
+ return parser.parse_args()
46
+
47
+
48
+ def build_dataset() -> list[dict[str, str]]:
49
+ return [
50
+ {
51
+ "input": "What is the capital of France?",
52
+ "actual_output": "Paris",
53
+ "expected_output": "Paris",
54
+ },
55
+ {
56
+ "input": "What is the capital of Canada?",
57
+ "actual_output": "Ottawa is the capital of Canada.",
58
+ "expected_output": "Ottawa",
59
+ },
60
+ {
61
+ "input": "What is 2 + 2?",
62
+ "actual_output": "4",
63
+ "expected_output": "4",
64
+ },
65
+ ]
66
+
67
+
68
+ def print_results(name: str, results: list[Any]) -> None:
69
+ run_id = results[0].run_id if results else "unknown"
70
+ print(f"eval_name={name}")
71
+ print(f"run_id={run_id}")
72
+ print()
73
+
74
+ for index, result in enumerate(results, start=1):
75
+ summary = " | ".join(
76
+ (
77
+ f"{scorer.name}:{'pass' if scorer.success else 'fail'}"
78
+ + (f" ({scorer.reason})" if scorer.reason else "")
79
+ )
80
+ for scorer in result.scorers_data
81
+ )
82
+ print(f"{index}. {'pass' if result.success else 'fail'} -> {summary}")
83
+
84
+
85
+ def main() -> None:
86
+ args = parse_args()
87
+ configure_example()
88
+
89
+ client = EvaluationClient()
90
+ results = client.run(
91
+ name=args.name,
92
+ data=build_dataset(),
93
+ scorers=[ExactMatchScorer(), NonEmptyOutputScorer()],
94
+ metadata={
95
+ "model": "rule-based-exact-match",
96
+ "provider": "local",
97
+ "source": "python",
98
+ "example": "exact-match",
99
+ },
100
+ )
101
+ print_results(args.name, results)
102
+
103
+
104
+ if __name__ == "__main__":
105
+ main()