freesolo 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- freesolo-0.2.2/README.md → freesolo-0.2.3/PKG-INFO +144 -28
- freesolo-0.2.2/PKG-INFO → freesolo-0.2.3/README.md +126 -42
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/.env.example +0 -1
- freesolo-0.2.3/pypi/examples/evals/__init__.py +1 -0
- freesolo-0.2.3/pypi/examples/evals/exact_match.py +105 -0
- freesolo-0.2.3/pypi/examples/evals/llm_judge.py +149 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/__init__.py +4 -2
- freesolo-0.2.3/pypi/freesolo/contracts/__init__.py +23 -0
- freesolo-0.2.3/pypi/freesolo/contracts/markdown.py +80 -0
- freesolo-0.2.3/pypi/freesolo/contracts/types.py +30 -0
- freesolo-0.2.3/pypi/freesolo/environments/__init__.py +32 -0
- freesolo-0.2.3/pypi/freesolo/environments/base.py +235 -0
- freesolo-0.2.3/pypi/freesolo/environments/evaluation.py +321 -0
- freesolo-0.2.3/pypi/freesolo/environments/types.py +157 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/__init__.py +14 -2
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/client.py +70 -6
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/__init__.py +27 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/base.py +182 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/groundedness.py +34 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/instruction_following.py +31 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/pairwise_preference.py +45 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/reference_correctness.py +26 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/judges/rubric.py +46 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/evaluation/responses.py +16 -8
- freesolo-0.2.3/pypi/freesolo/evaluation/results.py +93 -0
- freesolo-0.2.3/pypi/freesolo/evaluation/types.py +16 -0
- freesolo-0.2.3/pypi/freesolo/gepa/__init__.py +47 -0
- freesolo-0.2.3/pypi/freesolo/gepa/adapter.py +223 -0
- freesolo-0.2.3/pypi/freesolo/gepa/reflection.py +88 -0
- freesolo-0.2.3/pypi/freesolo/gepa/setup.py +227 -0
- freesolo-0.2.3/pypi/freesolo/gepa/types.py +127 -0
- freesolo-0.2.3/pypi/freesolo/storage.py +198 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/__init__.py +4 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/client.py +65 -100
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/decorators.py +8 -5
- freesolo-0.2.3/pypi/freesolo/tracing/types.py +79 -0
- freesolo-0.2.3/pypi/freesolo/training/__init__.py +14 -0
- freesolo-0.2.3/pypi/freesolo/training/grpo/__init__.py +20 -0
- freesolo-0.2.3/pypi/freesolo/training/grpo/config.py +44 -0
- freesolo-0.2.3/pypi/freesolo/training/grpo/datums.py +205 -0
- freesolo-0.2.3/pypi/freesolo/training/grpo/rewards.py +139 -0
- freesolo-0.2.3/pypi/freesolo/training/grpo/sampling.py +135 -0
- freesolo-0.2.3/pypi/freesolo/training/storage.py +57 -0
- freesolo-0.2.3/pypi/freesolo/training/train_grpo.py +433 -0
- freesolo-0.2.3/pypi/freesolo/training/train_sft.py +264 -0
- freesolo-0.2.3/pypi/freesolo/training/types.py +22 -0
- freesolo-0.2.3/pypi/freesolo/util.py +462 -0
- freesolo-0.2.3/pypi/freesolo/utils/__init__.py +119 -0
- freesolo-0.2.3/pypi/freesolo/utils/checkpoints.py +262 -0
- freesolo-0.2.3/pypi/freesolo/utils/deployment.py +70 -0
- freesolo-0.2.3/pypi/freesolo/utils/openrouter.py +217 -0
- freesolo-0.2.3/pypi/freesolo/utils/oracle.py +241 -0
- freesolo-0.2.3/pypi/freesolo/utils/wandb.py +344 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pyproject.toml +7 -2
- freesolo-0.2.3/tests/test_evaluation_client.py +160 -0
- freesolo-0.2.3/tests/test_gepa_adapter.py +95 -0
- freesolo-0.2.3/tests/test_storage_sync.py +324 -0
- freesolo-0.2.3/tests/test_utils_checkpoints.py +106 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/uv.lock +302 -2
- freesolo-0.2.2/pypi/freesolo/evaluation/hosted.py +0 -404
- freesolo-0.2.2/pypi/freesolo/evaluation/judges.py +0 -27
- freesolo-0.2.2/pypi/freesolo/evaluation/results.py +0 -61
- freesolo-0.2.2/pypi/freesolo/evaluation/utils.py +0 -11
- freesolo-0.2.2/pypi/freesolo/sdk.py +0 -52
- freesolo-0.2.2/pypi/freesolo/tracing/utils.py +0 -15
- freesolo-0.2.2/pypi/freesolo/utils.py +0 -37
- {freesolo-0.2.2 → freesolo-0.2.3}/.env.example +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/.gitignore +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/.gitignore +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/chat.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/anthropic/vision.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/chat.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/gemini/vision.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/chat.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openai/vision.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openrouter/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/openrouter/chat.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/utils.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/py.typed +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/__init__.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/anthropic.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/config.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/gemini.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/openai.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/providers/utils.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/pypi/freesolo/tracing/sanitize.py +0 -0
- {freesolo-0.2.2 → freesolo-0.2.3}/ruff.toml +0 -0
|
@@ -1,3 +1,21 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: freesolo
|
|
3
|
+
Version: 0.2.3
|
|
4
|
+
Summary: Tracing, evaluation, and training utilities for LLM applications.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: httpx>=0.27.0
|
|
7
|
+
Requires-Dist: wandb>=0.17.0
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
10
|
+
Requires-Dist: ruff>=0.11.0; extra == 'dev'
|
|
11
|
+
Provides-Extra: examples
|
|
12
|
+
Requires-Dist: anthropic>=0.40.0; extra == 'examples'
|
|
13
|
+
Requires-Dist: google-genai>=1.0.0; extra == 'examples'
|
|
14
|
+
Requires-Dist: openai>=1.0.0; extra == 'examples'
|
|
15
|
+
Provides-Extra: gepa
|
|
16
|
+
Requires-Dist: gepa>=0.1.1; extra == 'gepa'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
1
19
|
# freesolo
|
|
2
20
|
|
|
3
21
|
`freesolo` is a Python tracing and evaluation package for LLM apps.
|
|
@@ -7,7 +25,7 @@ It is built for the lowest-friction integration possible:
|
|
|
7
25
|
1. Install the package
|
|
8
26
|
2. Set `FREESOLO_API_KEY`
|
|
9
27
|
3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
|
|
10
|
-
4. Run traces and evaluations from the
|
|
28
|
+
4. Run traces and evaluations from the package APIs
|
|
11
29
|
|
|
12
30
|
## Current provider support
|
|
13
31
|
|
|
@@ -20,7 +38,7 @@ It is built for the lowest-friction integration possible:
|
|
|
20
38
|
|
|
21
39
|
## Install
|
|
22
40
|
|
|
23
|
-
Install the package plus the provider
|
|
41
|
+
Install the package plus the provider client you use:
|
|
24
42
|
|
|
25
43
|
```bash
|
|
26
44
|
pip install freesolo openai
|
|
@@ -154,7 +172,7 @@ with start_trace("support-agent-run"):
|
|
|
154
172
|
|
|
155
173
|
## Evaluations
|
|
156
174
|
|
|
157
|
-
`freesolo` also includes a small evaluation
|
|
175
|
+
`freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
|
|
158
176
|
eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
|
|
159
177
|
`api_key`.
|
|
160
178
|
|
|
@@ -168,8 +186,7 @@ results with your API key. Pass scorer objects, not strings.
|
|
|
168
186
|
```python
|
|
169
187
|
from typing import Any
|
|
170
188
|
|
|
171
|
-
from freesolo import
|
|
172
|
-
from freesolo.evaluation import BinaryResponse, CustomScorer
|
|
189
|
+
from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
|
|
173
190
|
|
|
174
191
|
|
|
175
192
|
class ExactMatch(CustomScorer[BinaryResponse]):
|
|
@@ -182,9 +199,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
|
|
|
182
199
|
)
|
|
183
200
|
|
|
184
201
|
|
|
185
|
-
client =
|
|
202
|
+
client = EvaluationClient()
|
|
186
203
|
|
|
187
|
-
results = client.
|
|
204
|
+
results = client.run(
|
|
188
205
|
name="support-agent-correctness",
|
|
189
206
|
data=[
|
|
190
207
|
{
|
|
@@ -199,13 +216,123 @@ results = client.evals.run(
|
|
|
199
216
|
print(results[0].success)
|
|
200
217
|
```
|
|
201
218
|
|
|
219
|
+
## Tinker Deployment
|
|
220
|
+
|
|
221
|
+
`freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
|
|
222
|
+
a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
|
|
223
|
+
the server JSON response.
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from freesolo.utils.deployment import deploy_tinker_checkpoint
|
|
227
|
+
|
|
228
|
+
result = deploy_tinker_checkpoint(
|
|
229
|
+
"tinker://<run_id>/sampler_weights/final",
|
|
230
|
+
base_model="Qwen/Qwen3.5-35B-A3B",
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
print(result["repoId"])
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### Environment-driven evaluations
|
|
237
|
+
|
|
238
|
+
For training contracts, you can use the same `Environment` adapter for evals,
|
|
239
|
+
SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
|
|
240
|
+
your model callback, scores the response through the environment, and uploads
|
|
241
|
+
the same `scorers_data` shape used by the eval DB.
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from typing import Any
|
|
245
|
+
|
|
246
|
+
from openai import OpenAI
|
|
247
|
+
|
|
248
|
+
from freesolo.environments import (
|
|
249
|
+
Environment,
|
|
250
|
+
EnvironmentGeneration,
|
|
251
|
+
RewardMetric,
|
|
252
|
+
RewardResult,
|
|
253
|
+
TaskExample,
|
|
254
|
+
)
|
|
255
|
+
from freesolo.evaluation import EvaluationClient
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class ContractEnvironment(Environment):
|
|
259
|
+
def build_prompt_messages(
|
|
260
|
+
self,
|
|
261
|
+
example: TaskExample,
|
|
262
|
+
contract_text: str,
|
|
263
|
+
):
|
|
264
|
+
return [
|
|
265
|
+
{"role": "system", "content": contract_text},
|
|
266
|
+
{"role": "user", "content": example.task},
|
|
267
|
+
]
|
|
268
|
+
|
|
269
|
+
def score_response(
|
|
270
|
+
self,
|
|
271
|
+
example: TaskExample,
|
|
272
|
+
response_text: str,
|
|
273
|
+
) -> RewardResult:
|
|
274
|
+
passed = response_text.strip() == str(example.expected_output).strip()
|
|
275
|
+
return RewardResult(
|
|
276
|
+
name="exact_match",
|
|
277
|
+
score=1.0 if passed else 0.0,
|
|
278
|
+
success=passed,
|
|
279
|
+
threshold=1.0,
|
|
280
|
+
reason="matched expected output" if passed else "mismatch",
|
|
281
|
+
return_type="binary",
|
|
282
|
+
metrics=(
|
|
283
|
+
RewardMetric(
|
|
284
|
+
name="canonical_match",
|
|
285
|
+
score=1.0 if passed else 0.0,
|
|
286
|
+
success=passed,
|
|
287
|
+
threshold=1.0,
|
|
288
|
+
),
|
|
289
|
+
),
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
model = OpenAI()
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def generate(messages: list[dict[str, str]], example: TaskExample):
|
|
297
|
+
response = model.chat.completions.create(
|
|
298
|
+
model="gpt-4.1-mini",
|
|
299
|
+
messages=messages,
|
|
300
|
+
)
|
|
301
|
+
return EnvironmentGeneration(
|
|
302
|
+
response_text=response.choices[0].message.content or "",
|
|
303
|
+
total_tokens=response.usage.total_tokens if response.usage else None,
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
results = EvaluationClient().run_environment(
|
|
308
|
+
name="contract-eval",
|
|
309
|
+
source="eval.jsonl",
|
|
310
|
+
contract_path="TRAINING_CONTRACT.md",
|
|
311
|
+
environment=ContractEnvironment(),
|
|
312
|
+
generate=generate,
|
|
313
|
+
)
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
`RewardResult` is the top-level scorer entry stored in
|
|
317
|
+
`eval_tasks.scorers_data`. Its fields are:
|
|
318
|
+
|
|
319
|
+
- `name`: scorer name shown in the UI.
|
|
320
|
+
- `score`: numeric reward value.
|
|
321
|
+
- `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
|
|
322
|
+
from whether `score > 0`.
|
|
323
|
+
- `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
|
|
324
|
+
pass/fail context.
|
|
325
|
+
- `latency_ms`, `total_tokens`: optional per-response usage metadata.
|
|
326
|
+
- `metadata`: JSON object for scorer-specific details.
|
|
327
|
+
- `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
|
|
328
|
+
`score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
|
|
329
|
+
|
|
202
330
|
Custom scorer:
|
|
203
331
|
|
|
204
332
|
```python
|
|
205
333
|
from typing import Any
|
|
206
334
|
|
|
207
|
-
from freesolo import
|
|
208
|
-
from freesolo.evaluation import BinaryResponse, CustomScorer
|
|
335
|
+
from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
|
|
209
336
|
|
|
210
337
|
|
|
211
338
|
class NoEmptyAnswer(CustomScorer[BinaryResponse]):
|
|
@@ -214,7 +341,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
|
|
|
214
341
|
return BinaryResponse(value=ok, reason="actual_output is non-empty")
|
|
215
342
|
|
|
216
343
|
|
|
217
|
-
results =
|
|
344
|
+
results = EvaluationClient().run(
|
|
218
345
|
name="support-agent-non-empty",
|
|
219
346
|
data=[{"actual_output": "hello"}],
|
|
220
347
|
scorers=[NoEmptyAnswer()],
|
|
@@ -232,8 +359,8 @@ from typing import Any
|
|
|
232
359
|
|
|
233
360
|
from openai import OpenAI
|
|
234
361
|
|
|
235
|
-
from freesolo import
|
|
236
|
-
from freesolo.evaluation import CustomScorer, NumericResponse
|
|
362
|
+
from freesolo import instrument_openai
|
|
363
|
+
from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
|
|
237
364
|
|
|
238
365
|
|
|
239
366
|
class CorrectnessJudge(CustomScorer[NumericResponse]):
|
|
@@ -278,7 +405,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
|
|
|
278
405
|
|
|
279
406
|
judge_client = instrument_openai(OpenAI())
|
|
280
407
|
|
|
281
|
-
results =
|
|
408
|
+
results = EvaluationClient().run(
|
|
282
409
|
name="support-agent-correctness",
|
|
283
410
|
data=[
|
|
284
411
|
{
|
|
@@ -302,27 +429,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
|
|
|
302
429
|
```python
|
|
303
430
|
from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
|
|
304
431
|
|
|
305
|
-
judge = HostedJudgeClient(
|
|
306
|
-
api_key="YOUR_OPENROUTER_API_KEY",
|
|
307
|
-
model="openai/gpt-oss-120b",
|
|
308
|
-
)
|
|
432
|
+
judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
|
|
309
433
|
|
|
310
434
|
scorer = ReferenceCorrectnessScorer(client=judge)
|
|
311
435
|
```
|
|
312
436
|
|
|
313
|
-
Tracing is available
|
|
437
|
+
Tracing is available through namespaced helpers:
|
|
314
438
|
|
|
315
439
|
```python
|
|
316
|
-
from freesolo import
|
|
440
|
+
from freesolo.tracing import start_trace
|
|
317
441
|
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
with client.traces.start("support-agent-run"):
|
|
442
|
+
with start_trace("support-agent-run"):
|
|
321
443
|
...
|
|
322
444
|
```
|
|
323
|
-
|
|
324
|
-
You can also import namespaced tracing helpers directly:
|
|
325
|
-
|
|
326
|
-
```python
|
|
327
|
-
from freesolo.tracing import start_trace, wrap
|
|
328
|
-
```
|
|
@@ -1,17 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: freesolo
|
|
3
|
-
Version: 0.2.2
|
|
4
|
-
Summary: Tracing and evaluation SDK for LLM applications.
|
|
5
|
-
Requires-Python: >=3.10
|
|
6
|
-
Requires-Dist: httpx>=0.27.0
|
|
7
|
-
Provides-Extra: dev
|
|
8
|
-
Requires-Dist: ruff>=0.11.0; extra == 'dev'
|
|
9
|
-
Provides-Extra: examples
|
|
10
|
-
Requires-Dist: anthropic>=0.40.0; extra == 'examples'
|
|
11
|
-
Requires-Dist: google-genai>=1.0.0; extra == 'examples'
|
|
12
|
-
Requires-Dist: openai>=1.0.0; extra == 'examples'
|
|
13
|
-
Description-Content-Type: text/markdown
|
|
14
|
-
|
|
15
1
|
# freesolo
|
|
16
2
|
|
|
17
3
|
`freesolo` is a Python tracing and evaluation package for LLM apps.
|
|
@@ -21,7 +7,7 @@ It is built for the lowest-friction integration possible:
|
|
|
21
7
|
1. Install the package
|
|
22
8
|
2. Set `FREESOLO_API_KEY`
|
|
23
9
|
3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
|
|
24
|
-
4. Run traces and evaluations from the
|
|
10
|
+
4. Run traces and evaluations from the package APIs
|
|
25
11
|
|
|
26
12
|
## Current provider support
|
|
27
13
|
|
|
@@ -34,7 +20,7 @@ It is built for the lowest-friction integration possible:
|
|
|
34
20
|
|
|
35
21
|
## Install
|
|
36
22
|
|
|
37
|
-
Install the package plus the provider
|
|
23
|
+
Install the package plus the provider client you use:
|
|
38
24
|
|
|
39
25
|
```bash
|
|
40
26
|
pip install freesolo openai
|
|
@@ -168,7 +154,7 @@ with start_trace("support-agent-run"):
|
|
|
168
154
|
|
|
169
155
|
## Evaluations
|
|
170
156
|
|
|
171
|
-
`freesolo` also includes a small evaluation
|
|
157
|
+
`freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
|
|
172
158
|
eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
|
|
173
159
|
`api_key`.
|
|
174
160
|
|
|
@@ -182,8 +168,7 @@ results with your API key. Pass scorer objects, not strings.
|
|
|
182
168
|
```python
|
|
183
169
|
from typing import Any
|
|
184
170
|
|
|
185
|
-
from freesolo import
|
|
186
|
-
from freesolo.evaluation import BinaryResponse, CustomScorer
|
|
171
|
+
from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
|
|
187
172
|
|
|
188
173
|
|
|
189
174
|
class ExactMatch(CustomScorer[BinaryResponse]):
|
|
@@ -196,9 +181,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
|
|
|
196
181
|
)
|
|
197
182
|
|
|
198
183
|
|
|
199
|
-
client =
|
|
184
|
+
client = EvaluationClient()
|
|
200
185
|
|
|
201
|
-
results = client.
|
|
186
|
+
results = client.run(
|
|
202
187
|
name="support-agent-correctness",
|
|
203
188
|
data=[
|
|
204
189
|
{
|
|
@@ -213,13 +198,123 @@ results = client.evals.run(
|
|
|
213
198
|
print(results[0].success)
|
|
214
199
|
```
|
|
215
200
|
|
|
201
|
+
## Tinker Deployment
|
|
202
|
+
|
|
203
|
+
`freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
|
|
204
|
+
a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
|
|
205
|
+
the server JSON response.
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
from freesolo.utils.deployment import deploy_tinker_checkpoint
|
|
209
|
+
|
|
210
|
+
result = deploy_tinker_checkpoint(
|
|
211
|
+
"tinker://<run_id>/sampler_weights/final",
|
|
212
|
+
base_model="Qwen/Qwen3.5-35B-A3B",
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
print(result["repoId"])
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Environment-driven evaluations
|
|
219
|
+
|
|
220
|
+
For training contracts, you can use the same `Environment` adapter for evals,
|
|
221
|
+
SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
|
|
222
|
+
your model callback, scores the response through the environment, and uploads
|
|
223
|
+
the same `scorers_data` shape used by the eval DB.
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from typing import Any
|
|
227
|
+
|
|
228
|
+
from openai import OpenAI
|
|
229
|
+
|
|
230
|
+
from freesolo.environments import (
|
|
231
|
+
Environment,
|
|
232
|
+
EnvironmentGeneration,
|
|
233
|
+
RewardMetric,
|
|
234
|
+
RewardResult,
|
|
235
|
+
TaskExample,
|
|
236
|
+
)
|
|
237
|
+
from freesolo.evaluation import EvaluationClient
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class ContractEnvironment(Environment):
|
|
241
|
+
def build_prompt_messages(
|
|
242
|
+
self,
|
|
243
|
+
example: TaskExample,
|
|
244
|
+
contract_text: str,
|
|
245
|
+
):
|
|
246
|
+
return [
|
|
247
|
+
{"role": "system", "content": contract_text},
|
|
248
|
+
{"role": "user", "content": example.task},
|
|
249
|
+
]
|
|
250
|
+
|
|
251
|
+
def score_response(
|
|
252
|
+
self,
|
|
253
|
+
example: TaskExample,
|
|
254
|
+
response_text: str,
|
|
255
|
+
) -> RewardResult:
|
|
256
|
+
passed = response_text.strip() == str(example.expected_output).strip()
|
|
257
|
+
return RewardResult(
|
|
258
|
+
name="exact_match",
|
|
259
|
+
score=1.0 if passed else 0.0,
|
|
260
|
+
success=passed,
|
|
261
|
+
threshold=1.0,
|
|
262
|
+
reason="matched expected output" if passed else "mismatch",
|
|
263
|
+
return_type="binary",
|
|
264
|
+
metrics=(
|
|
265
|
+
RewardMetric(
|
|
266
|
+
name="canonical_match",
|
|
267
|
+
score=1.0 if passed else 0.0,
|
|
268
|
+
success=passed,
|
|
269
|
+
threshold=1.0,
|
|
270
|
+
),
|
|
271
|
+
),
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
model = OpenAI()
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def generate(messages: list[dict[str, str]], example: TaskExample):
|
|
279
|
+
response = model.chat.completions.create(
|
|
280
|
+
model="gpt-4.1-mini",
|
|
281
|
+
messages=messages,
|
|
282
|
+
)
|
|
283
|
+
return EnvironmentGeneration(
|
|
284
|
+
response_text=response.choices[0].message.content or "",
|
|
285
|
+
total_tokens=response.usage.total_tokens if response.usage else None,
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
results = EvaluationClient().run_environment(
|
|
290
|
+
name="contract-eval",
|
|
291
|
+
source="eval.jsonl",
|
|
292
|
+
contract_path="TRAINING_CONTRACT.md",
|
|
293
|
+
environment=ContractEnvironment(),
|
|
294
|
+
generate=generate,
|
|
295
|
+
)
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
`RewardResult` is the top-level scorer entry stored in
|
|
299
|
+
`eval_tasks.scorers_data`. Its fields are:
|
|
300
|
+
|
|
301
|
+
- `name`: scorer name shown in the UI.
|
|
302
|
+
- `score`: numeric reward value.
|
|
303
|
+
- `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
|
|
304
|
+
from whether `score > 0`.
|
|
305
|
+
- `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
|
|
306
|
+
pass/fail context.
|
|
307
|
+
- `latency_ms`, `total_tokens`: optional per-response usage metadata.
|
|
308
|
+
- `metadata`: JSON object for scorer-specific details.
|
|
309
|
+
- `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
|
|
310
|
+
`score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
|
|
311
|
+
|
|
216
312
|
Custom scorer:
|
|
217
313
|
|
|
218
314
|
```python
|
|
219
315
|
from typing import Any
|
|
220
316
|
|
|
221
|
-
from freesolo import
|
|
222
|
-
from freesolo.evaluation import BinaryResponse, CustomScorer
|
|
317
|
+
from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
|
|
223
318
|
|
|
224
319
|
|
|
225
320
|
class NoEmptyAnswer(CustomScorer[BinaryResponse]):
|
|
@@ -228,7 +323,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
|
|
|
228
323
|
return BinaryResponse(value=ok, reason="actual_output is non-empty")
|
|
229
324
|
|
|
230
325
|
|
|
231
|
-
results =
|
|
326
|
+
results = EvaluationClient().run(
|
|
232
327
|
name="support-agent-non-empty",
|
|
233
328
|
data=[{"actual_output": "hello"}],
|
|
234
329
|
scorers=[NoEmptyAnswer()],
|
|
@@ -246,8 +341,8 @@ from typing import Any
|
|
|
246
341
|
|
|
247
342
|
from openai import OpenAI
|
|
248
343
|
|
|
249
|
-
from freesolo import
|
|
250
|
-
from freesolo.evaluation import CustomScorer, NumericResponse
|
|
344
|
+
from freesolo import instrument_openai
|
|
345
|
+
from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
|
|
251
346
|
|
|
252
347
|
|
|
253
348
|
class CorrectnessJudge(CustomScorer[NumericResponse]):
|
|
@@ -292,7 +387,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
|
|
|
292
387
|
|
|
293
388
|
judge_client = instrument_openai(OpenAI())
|
|
294
389
|
|
|
295
|
-
results =
|
|
390
|
+
results = EvaluationClient().run(
|
|
296
391
|
name="support-agent-correctness",
|
|
297
392
|
data=[
|
|
298
393
|
{
|
|
@@ -316,27 +411,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
|
|
|
316
411
|
```python
|
|
317
412
|
from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
|
|
318
413
|
|
|
319
|
-
judge = HostedJudgeClient(
|
|
320
|
-
api_key="YOUR_OPENROUTER_API_KEY",
|
|
321
|
-
model="openai/gpt-oss-120b",
|
|
322
|
-
)
|
|
414
|
+
judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
|
|
323
415
|
|
|
324
416
|
scorer = ReferenceCorrectnessScorer(client=judge)
|
|
325
417
|
```
|
|
326
418
|
|
|
327
|
-
Tracing is available
|
|
419
|
+
Tracing is available through namespaced helpers:
|
|
328
420
|
|
|
329
421
|
```python
|
|
330
|
-
from freesolo import
|
|
422
|
+
from freesolo.tracing import start_trace
|
|
331
423
|
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
with client.traces.start("support-agent-run"):
|
|
424
|
+
with start_trace("support-agent-run"):
|
|
335
425
|
...
|
|
336
426
|
```
|
|
337
|
-
|
|
338
|
-
You can also import namespaced tracing helpers directly:
|
|
339
|
-
|
|
340
|
-
```python
|
|
341
|
-
from freesolo.tracing import start_trace, wrap
|
|
342
|
-
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Evaluation examples for the Python package."""
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from freesolo import BinaryResponse, CustomScorer
|
|
7
|
+
from freesolo.evaluation import EvaluationClient
|
|
8
|
+
|
|
9
|
+
from ..utils import configure_example
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExactMatchScorer(CustomScorer[BinaryResponse]):
|
|
13
|
+
name = "exact_match"
|
|
14
|
+
|
|
15
|
+
async def score(self, row: dict[str, Any]) -> BinaryResponse:
|
|
16
|
+
actual = str(row.get("actual_output", "")).strip()
|
|
17
|
+
expected = str(row.get("expected_output", "")).strip()
|
|
18
|
+
success = bool(actual) and actual == expected
|
|
19
|
+
return BinaryResponse(
|
|
20
|
+
success,
|
|
21
|
+
reason=(
|
|
22
|
+
"actual_output matched expected_output"
|
|
23
|
+
if success
|
|
24
|
+
else f'expected "{expected}" but got "{actual or "<empty>"}"'
|
|
25
|
+
),
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class NonEmptyOutputScorer(CustomScorer[BinaryResponse]):
|
|
30
|
+
name = "non_empty_output"
|
|
31
|
+
|
|
32
|
+
async def score(self, row: dict[str, Any]) -> BinaryResponse:
|
|
33
|
+
actual = str(row.get("actual_output", "")).strip()
|
|
34
|
+
return BinaryResponse(
|
|
35
|
+
bool(actual),
|
|
36
|
+
reason=(
|
|
37
|
+
"actual_output is non-empty" if actual else "actual_output was empty"
|
|
38
|
+
),
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def parse_args() -> argparse.Namespace:
|
|
43
|
+
parser = argparse.ArgumentParser(description="Exact-match eval example.")
|
|
44
|
+
parser.add_argument("name", nargs="?", default="eval-example-exact-match-py")
|
|
45
|
+
return parser.parse_args()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def build_dataset() -> list[dict[str, str]]:
|
|
49
|
+
return [
|
|
50
|
+
{
|
|
51
|
+
"input": "What is the capital of France?",
|
|
52
|
+
"actual_output": "Paris",
|
|
53
|
+
"expected_output": "Paris",
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"input": "What is the capital of Canada?",
|
|
57
|
+
"actual_output": "Ottawa is the capital of Canada.",
|
|
58
|
+
"expected_output": "Ottawa",
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
"input": "What is 2 + 2?",
|
|
62
|
+
"actual_output": "4",
|
|
63
|
+
"expected_output": "4",
|
|
64
|
+
},
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def print_results(name: str, results: list[Any]) -> None:
|
|
69
|
+
run_id = results[0].run_id if results else "unknown"
|
|
70
|
+
print(f"eval_name={name}")
|
|
71
|
+
print(f"run_id={run_id}")
|
|
72
|
+
print()
|
|
73
|
+
|
|
74
|
+
for index, result in enumerate(results, start=1):
|
|
75
|
+
summary = " | ".join(
|
|
76
|
+
(
|
|
77
|
+
f"{scorer.name}:{'pass' if scorer.success else 'fail'}"
|
|
78
|
+
+ (f" ({scorer.reason})" if scorer.reason else "")
|
|
79
|
+
)
|
|
80
|
+
for scorer in result.scorers_data
|
|
81
|
+
)
|
|
82
|
+
print(f"{index}. {'pass' if result.success else 'fail'} -> {summary}")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def main() -> None:
|
|
86
|
+
args = parse_args()
|
|
87
|
+
configure_example()
|
|
88
|
+
|
|
89
|
+
client = EvaluationClient()
|
|
90
|
+
results = client.run(
|
|
91
|
+
name=args.name,
|
|
92
|
+
data=build_dataset(),
|
|
93
|
+
scorers=[ExactMatchScorer(), NonEmptyOutputScorer()],
|
|
94
|
+
metadata={
|
|
95
|
+
"model": "rule-based-exact-match",
|
|
96
|
+
"provider": "local",
|
|
97
|
+
"source": "python",
|
|
98
|
+
"example": "exact-match",
|
|
99
|
+
},
|
|
100
|
+
)
|
|
101
|
+
print_results(args.name, results)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
|
|
105
|
+
main()
|