freesolo 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. freesolo-0.2.4/.github/workflows/publish-packages.yml +96 -0
  2. freesolo-0.2.4/.github/workflows/python-checks.yml +41 -0
  3. freesolo-0.2.4/.github/workflows/sync-package-function-usage.yml +38 -0
  4. freesolo-0.2.4/PKG-INFO +392 -0
  5. freesolo-0.2.4/README.md +368 -0
  6. freesolo-0.2.4/examples/PROMPT.md +10 -0
  7. freesolo-0.2.4/examples/README.md +96 -0
  8. freesolo-0.2.4/examples/TRAINING_CONTRACT.md +10 -0
  9. freesolo-0.2.4/examples/data/support_eval.jsonl +3 -0
  10. freesolo-0.2.4/examples/data/support_train.jsonl +3 -0
  11. freesolo-0.2.4/examples/environment.py +110 -0
  12. freesolo-0.2.4/examples/evaluation_custom_scorer.py +105 -0
  13. freesolo-0.2.4/examples/evaluation_from_files.py +47 -0
  14. freesolo-0.2.4/examples/gepa_prompt_example.py +76 -0
  15. freesolo-0.2.4/examples/support_dataset.py +20 -0
  16. freesolo-0.2.4/examples/tracing_manual_span.py +47 -0
  17. freesolo-0.2.4/examples/training_sft_grpo.py +75 -0
  18. freesolo-0.2.4/function_usage_registry.json +12 -0
  19. freesolo-0.2.4/pypi/freesolo/__init__.py +1 -0
  20. freesolo-0.2.4/pypi/freesolo/_usage.py +39 -0
  21. freesolo-0.2.4/pypi/freesolo/contracts/__init__.py +23 -0
  22. freesolo-0.2.4/pypi/freesolo/contracts/markdown.py +76 -0
  23. freesolo-0.2.4/pypi/freesolo/contracts/types.py +29 -0
  24. freesolo-0.2.4/pypi/freesolo/datasets/__init__.py +13 -0
  25. freesolo-0.2.4/pypi/freesolo/datasets/core.py +74 -0
  26. freesolo-0.2.4/pypi/freesolo/datasets/records.py +139 -0
  27. freesolo-0.2.4/pypi/freesolo/datasets/types.py +26 -0
  28. freesolo-0.2.4/pypi/freesolo/environments/__init__.py +17 -0
  29. freesolo-0.2.4/pypi/freesolo/environments/base.py +162 -0
  30. freesolo-0.2.4/pypi/freesolo/environments/evaluation.py +358 -0
  31. freesolo-0.2.4/pypi/freesolo/environments/types.py +79 -0
  32. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/__init__.py +5 -8
  33. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/client.py +86 -18
  34. freesolo-0.2.4/pypi/freesolo/evaluation/judges/__init__.py +19 -0
  35. freesolo-0.2.4/pypi/freesolo/evaluation/judges/base.py +135 -0
  36. freesolo-0.2.4/pypi/freesolo/evaluation/judges/groundedness.py +34 -0
  37. freesolo-0.2.4/pypi/freesolo/evaluation/judges/instruction_following.py +31 -0
  38. freesolo-0.2.4/pypi/freesolo/evaluation/judges/pairwise_preference.py +45 -0
  39. freesolo-0.2.4/pypi/freesolo/evaluation/judges/reference_correctness.py +26 -0
  40. freesolo-0.2.4/pypi/freesolo/evaluation/judges/rubric.py +46 -0
  41. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/responses.py +16 -8
  42. freesolo-0.2.4/pypi/freesolo/evaluation/results.py +93 -0
  43. freesolo-0.2.4/pypi/freesolo/evaluation/types.py +16 -0
  44. freesolo-0.2.4/pypi/freesolo/gepa/__init__.py +17 -0
  45. freesolo-0.2.4/pypi/freesolo/gepa/adapter.py +226 -0
  46. freesolo-0.2.4/pypi/freesolo/gepa/reflection.py +103 -0
  47. freesolo-0.2.4/pypi/freesolo/gepa/setup.py +219 -0
  48. freesolo-0.2.4/pypi/freesolo/gepa/types.py +120 -0
  49. freesolo-0.2.4/pypi/freesolo/tracing/__init__.py +13 -0
  50. freesolo-0.2.4/pypi/freesolo/tracing/otel.py +250 -0
  51. freesolo-0.2.4/pypi/freesolo/training/__init__.py +10 -0
  52. freesolo-0.2.4/pypi/freesolo/training/grpo/__init__.py +0 -0
  53. freesolo-0.2.4/pypi/freesolo/training/grpo/config.py +38 -0
  54. freesolo-0.2.4/pypi/freesolo/training/grpo/datums.py +196 -0
  55. freesolo-0.2.4/pypi/freesolo/training/grpo/rewards.py +133 -0
  56. freesolo-0.2.4/pypi/freesolo/training/grpo/sampling.py +127 -0
  57. freesolo-0.2.4/pypi/freesolo/training/storage.py +54 -0
  58. freesolo-0.2.4/pypi/freesolo/training/train_grpo.py +429 -0
  59. freesolo-0.2.4/pypi/freesolo/training/train_sft.py +284 -0
  60. freesolo-0.2.4/pypi/freesolo/training/types.py +34 -0
  61. freesolo-0.2.4/pypi/freesolo/utils/__init__.py +0 -0
  62. freesolo-0.2.4/pypi/freesolo/utils/checkpoints.py +239 -0
  63. freesolo-0.2.4/pypi/freesolo/utils/core.py +255 -0
  64. freesolo-0.2.4/pypi/freesolo/utils/openrouter.py +180 -0
  65. freesolo-0.2.4/pypi/freesolo/utils/oracle.py +240 -0
  66. freesolo-0.2.4/pypi/freesolo/utils/storage.py +239 -0
  67. freesolo-0.2.4/pypi/freesolo/utils/upload.py +60 -0
  68. freesolo-0.2.4/pypi/freesolo/utils/wandb.py +303 -0
  69. freesolo-0.2.4/pyproject.toml +36 -0
  70. {freesolo-0.2.2 → freesolo-0.2.4}/ruff.toml +3 -0
  71. freesolo-0.2.4/scripts/sync_package_function_usage.py +81 -0
  72. freesolo-0.2.4/tests/end_to_end_testing/test_environment_evaluation_flow.py +140 -0
  73. freesolo-0.2.4/tests/end_to_end_testing/test_examples.py +141 -0
  74. freesolo-0.2.4/tests/functionality/test_datasets.py +113 -0
  75. freesolo-0.2.4/tests/functionality/test_evaluation_client.py +161 -0
  76. freesolo-0.2.4/tests/functionality/test_gepa_adapter.py +133 -0
  77. freesolo-0.2.4/tests/functionality/test_records_rewards_and_config.py +126 -0
  78. freesolo-0.2.4/tests/functionality/test_storage_sync.py +447 -0
  79. freesolo-0.2.4/tests/functionality/test_tracing_opentelemetry.py +128 -0
  80. freesolo-0.2.4/tests/functionality/test_upload.py +97 -0
  81. freesolo-0.2.4/tests/functionality/test_usage_registry.py +63 -0
  82. freesolo-0.2.4/tests/functionality/test_utils_checkpoints.py +106 -0
  83. freesolo-0.2.4/tests/security/test_sanitize_and_contract_security.py +60 -0
  84. freesolo-0.2.4/uv.lock +3328 -0
  85. freesolo-0.2.2/PKG-INFO +0 -342
  86. freesolo-0.2.2/README.md +0 -328
  87. freesolo-0.2.2/pypi/examples/.env.example +0 -11
  88. freesolo-0.2.2/pypi/examples/__init__.py +0 -1
  89. freesolo-0.2.2/pypi/examples/anthropic/__init__.py +0 -1
  90. freesolo-0.2.2/pypi/examples/anthropic/chat.py +0 -56
  91. freesolo-0.2.2/pypi/examples/anthropic/vision.py +0 -82
  92. freesolo-0.2.2/pypi/examples/gemini/__init__.py +0 -1
  93. freesolo-0.2.2/pypi/examples/gemini/chat.py +0 -53
  94. freesolo-0.2.2/pypi/examples/gemini/vision.py +0 -79
  95. freesolo-0.2.2/pypi/examples/openai/__init__.py +0 -1
  96. freesolo-0.2.2/pypi/examples/openai/chat.py +0 -56
  97. freesolo-0.2.2/pypi/examples/openai/vision.py +0 -64
  98. freesolo-0.2.2/pypi/examples/openrouter/__init__.py +0 -1
  99. freesolo-0.2.2/pypi/examples/openrouter/chat.py +0 -60
  100. freesolo-0.2.2/pypi/examples/utils.py +0 -231
  101. freesolo-0.2.2/pypi/freesolo/__init__.py +0 -59
  102. freesolo-0.2.2/pypi/freesolo/evaluation/hosted.py +0 -404
  103. freesolo-0.2.2/pypi/freesolo/evaluation/judges.py +0 -27
  104. freesolo-0.2.2/pypi/freesolo/evaluation/results.py +0 -61
  105. freesolo-0.2.2/pypi/freesolo/evaluation/utils.py +0 -11
  106. freesolo-0.2.2/pypi/freesolo/sdk.py +0 -52
  107. freesolo-0.2.2/pypi/freesolo/tracing/__init__.py +0 -27
  108. freesolo-0.2.2/pypi/freesolo/tracing/client.py +0 -583
  109. freesolo-0.2.2/pypi/freesolo/tracing/decorators.py +0 -63
  110. freesolo-0.2.2/pypi/freesolo/tracing/providers/__init__.py +0 -14
  111. freesolo-0.2.2/pypi/freesolo/tracing/providers/anthropic.py +0 -111
  112. freesolo-0.2.2/pypi/freesolo/tracing/providers/config.py +0 -101
  113. freesolo-0.2.2/pypi/freesolo/tracing/providers/gemini.py +0 -205
  114. freesolo-0.2.2/pypi/freesolo/tracing/providers/openai.py +0 -208
  115. freesolo-0.2.2/pypi/freesolo/tracing/providers/utils.py +0 -276
  116. freesolo-0.2.2/pypi/freesolo/tracing/utils.py +0 -15
  117. freesolo-0.2.2/pypi/freesolo/utils.py +0 -37
  118. freesolo-0.2.2/pyproject.toml +0 -26
  119. freesolo-0.2.2/uv.lock +0 -904
  120. {freesolo-0.2.2 → freesolo-0.2.4}/.env.example +0 -0
  121. {freesolo-0.2.2 → freesolo-0.2.4}/.gitignore +0 -0
  122. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/.gitignore +0 -0
  123. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/py.typed +0 -0
  124. {freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/tracing/sanitize.py +0 -0
@@ -0,0 +1,96 @@
1
+ name: Publish packages
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "pyproject.toml"
9
+ - "uv.lock"
10
+ - "pypi/**"
11
+ - "examples/**"
12
+ - ".github/workflows/publish-packages.yml"
13
+ workflow_dispatch:
14
+
15
+ concurrency:
16
+ group: publish-packages-${{ github.ref }}
17
+ cancel-in-progress: false
18
+
19
+ jobs:
20
+ publish-pypi:
21
+ name: Publish PyPI package
22
+ runs-on: ubuntu-latest
23
+ permissions:
24
+ contents: read
25
+ env:
26
+ UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
27
+
28
+ steps:
29
+ - uses: actions/checkout@v6
30
+
31
+ - uses: actions/setup-python@v6
32
+ with:
33
+ python-version: "3.12"
34
+
35
+ - name: Read package metadata
36
+ id: metadata
37
+ run: |
38
+ python - <<'PY' >> "$GITHUB_OUTPUT"
39
+ import tomllib
40
+
41
+ with open("pyproject.toml", "rb") as f:
42
+ project = tomllib.load(f)["project"]
43
+
44
+ print(f"name={project['name']}")
45
+ print(f"version={project['version']}")
46
+ PY
47
+
48
+ - name: Check PyPI for existing version
49
+ id: pypi
50
+ env:
51
+ PACKAGE_NAME: ${{ steps.metadata.outputs.name }}
52
+ PACKAGE_VERSION: ${{ steps.metadata.outputs.version }}
53
+ run: |
54
+ python - <<'PY' >> "$GITHUB_OUTPUT"
55
+ import os
56
+ import urllib.error
57
+ import urllib.request
58
+
59
+ name = os.environ["PACKAGE_NAME"]
60
+ version = os.environ["PACKAGE_VERSION"]
61
+ url = f"https://pypi.org/pypi/{name}/{version}/json"
62
+
63
+ try:
64
+ with urllib.request.urlopen(url, timeout=30) as response:
65
+ exists = response.status == 200
66
+ except urllib.error.HTTPError as error:
67
+ if error.code != 404:
68
+ raise
69
+ exists = False
70
+
71
+ print(f"exists={'true' if exists else 'false'}")
72
+ PY
73
+
74
+ - name: Skip existing PyPI version
75
+ if: steps.pypi.outputs.exists == 'true'
76
+ run: echo "${{ steps.metadata.outputs.name }} ${{ steps.metadata.outputs.version }} is already on PyPI."
77
+
78
+ - name: Install uv
79
+ if: steps.pypi.outputs.exists != 'true'
80
+ run: python -m pip install --upgrade uv
81
+
82
+ - name: Build distributions
83
+ if: steps.pypi.outputs.exists != 'true'
84
+ run: |
85
+ rm -rf dist
86
+ uv build
87
+
88
+ - name: Publish to PyPI
89
+ if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN != ''
90
+ run: uv publish
91
+
92
+ - name: Skip publish without PyPI token
93
+ if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN == ''
94
+ run: |
95
+ echo "PYPI_API_TOKEN is not configured; built distributions but skipped upload."
96
+ echo "Add a PYPI_API_TOKEN repository secret to publish this package."
@@ -0,0 +1,41 @@
1
+ name: Python checks
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+ checks:
15
+ name: Ruff and tests
16
+ runs-on: ubuntu-latest
17
+
18
+ steps:
19
+ - uses: actions/checkout@v6
20
+
21
+ - uses: actions/setup-python@v6
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Install uv
26
+ run: python3 -m pip install --upgrade uv
27
+
28
+ - name: Install dependencies
29
+ run: uv sync --locked --extra dev
30
+
31
+ - name: Python compile check
32
+ run: python3 -m py_compile $(find pypi tests -name '*.py' -print)
33
+
34
+ - name: Ruff check
35
+ run: uv run --extra dev python -m ruff check .
36
+
37
+ - name: Ruff format check
38
+ run: uv run --extra dev python -m ruff format --check .
39
+
40
+ - name: Tests
41
+ run: uv run --extra dev python -m pytest tests
@@ -0,0 +1,38 @@
1
+ name: Sync package function usage
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "function_usage_registry.json"
9
+ - "scripts/sync_package_function_usage.py"
10
+ - ".github/workflows/sync-package-function-usage.yml"
11
+ workflow_dispatch:
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ sync:
18
+ name: Sync usage registry
19
+ runs-on: ubuntu-latest
20
+ if: ${{ github.ref == 'refs/heads/main' }}
21
+ env:
22
+ SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
23
+ SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
24
+
25
+ steps:
26
+ - uses: actions/checkout@v6
27
+
28
+ - uses: actions/setup-python@v6
29
+ with:
30
+ python-version: "3.12"
31
+
32
+ - name: Sync package function rows
33
+ if: env.SUPABASE_URL != '' && env.SUPABASE_SERVICE_ROLE_KEY != ''
34
+ run: python scripts/sync_package_function_usage.py --remove-stale
35
+
36
+ - name: Skip without Supabase secrets
37
+ if: env.SUPABASE_URL == '' || env.SUPABASE_SERVICE_ROLE_KEY == ''
38
+ run: echo "SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY secrets are required to sync usage."
@@ -0,0 +1,392 @@
1
+ Metadata-Version: 2.4
2
+ Name: freesolo
3
+ Version: 0.2.4
4
+ Summary: Tracing, evaluation, and training utilities for LLM applications.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: gepa>=0.1.1
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: jsonschema>=4.0.0
9
+ Requires-Dist: numpy>=1.26.0
10
+ Requires-Dist: opentelemetry-api>=1.28.0
11
+ Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.28.0
12
+ Requires-Dist: opentelemetry-sdk>=1.28.0
13
+ Requires-Dist: pymongo>=4.0.0
14
+ Requires-Dist: python-dotenv>=1.0.0
15
+ Requires-Dist: tinker-cookbook>=0.3.0
16
+ Requires-Dist: tinker>=0.19.0
17
+ Requires-Dist: wandb>=0.17.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
20
+ Requires-Dist: ruff>=0.11.0; extra == 'dev'
21
+ Provides-Extra: examples
22
+ Requires-Dist: openai>=1.0.0; extra == 'examples'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # freesolo
26
+
27
+ `freesolo` is a Python tracing, evaluation, and training package for LLM apps.
28
+
29
+ It is built for the lowest-friction integration possible:
30
+
31
+ 1. Install the package
32
+ 2. Set `FREESOLO_API_KEY`
33
+ 3. Configure the tracer
34
+ 4. Run traces and evaluations from the package APIs
35
+
36
+ ## Install
37
+
38
+ Install the package:
39
+
40
+ ```bash
41
+ pip install freesolo
42
+ ```
43
+
44
+ ## Environment
45
+
46
+ - `FREESOLO_API_KEY`
47
+ - `FREESOLO_BASE_URL` (optional, defaults to `https://api.freesolo.co`)
48
+
49
+ ```bash
50
+ export FREESOLO_API_KEY=fslo_...
51
+ ```
52
+
53
+ ## Quickstart
54
+
55
+ ```python
56
+ from freesolo.tracing import configure_tracer, get_tracer
57
+
58
+ configure_tracer(service_name="my-llm-app")
59
+ tracer = get_tracer()
60
+
61
+ with tracer.start_as_current_span(
62
+ "model.call",
63
+ attributes={
64
+ "gen_ai.system": "openai",
65
+ "gen_ai.request.model": "gpt-5.5",
66
+ "freesolo.input": {"prompt": "How do I reset my password?"},
67
+ },
68
+ ) as span:
69
+ result = "Reset it from account settings."
70
+ span.set_attribute("freesolo.output", result)
71
+ ```
72
+
73
+ ## Runnable Examples
74
+
75
+ Copy-pasteable examples live in [`examples/`](examples/):
76
+
77
+ - `tracing_manual_span.py`: configure OpenTelemetry and send one application span.
78
+ - `evaluation_custom_scorer.py`: run custom binary and numeric eval scorers.
79
+ - `evaluation_from_files.py`: run evals from a concrete dataset and environment.
80
+ - `environment.py`: example environment used by evals, training, and GEPA.
81
+ - `support_dataset.py`: example dataset paths and loaders used by evals, SFT, GRPO, and GEPA.
82
+ - `gepa_prompt_example.py`: run the Freesolo GEPA adapter over the example dataset.
83
+ - `training_sft_grpo.py`: start SFT or GRPO training runs from package APIs.
84
+
85
+ From a repo checkout:
86
+
87
+ ```bash
88
+ cd freesolo-sdk
89
+ export PYTHONPATH="$PWD/pypi"
90
+ uv run python examples/evaluation_custom_scorer.py --local
91
+ ```
92
+
93
+ ## Public API
94
+
95
+ The root `freesolo` module intentionally exports no functions. Import from the
96
+ subpackages below; lower-level modules may be importable, but they are
97
+ implementation helpers unless they appear here or in an example.
98
+
99
+ | Import | Use case |
100
+ | --- | --- |
101
+ | `freesolo.tracing.configure_tracer`, `get_tracer`, `force_flush`, `shutdown` | Send OpenTelemetry traces from an application to Freesolo. |
102
+ | `freesolo.evaluation.EvaluationClient` | Run custom-scorer evals or environment evals and upload results to Freesolo. |
103
+ | `freesolo.evaluation.run_local_evaluation` | Run custom scorers locally without uploading results. |
104
+ | `freesolo.evaluation.CustomScorer`, `BinaryResponse`, `NumericResponse` | Define local scorer logic for eval rows. |
105
+ | `freesolo.evaluation.HostedJudgeClient` and hosted scorer classes | Use hosted LLM-as-judge scorers with OpenRouter-compatible credentials. |
106
+ | `freesolo.datasets.TaskExample`, `Dataset`, `load_dataset` | Load task examples and construct labeled conversations for evals or training. |
107
+ | `freesolo.environments.Environment`, `RewardResult`, `RewardMetric`, `GrpoConfig`, `EnvironmentGeneration` | Define task behavior once for evals, GEPA, SFT, and GRPO. |
108
+ | `freesolo.training.SftConfig`, `TrainGrpoOptions`, `train_sft`, `train_grpo` | Start SFT or GRPO training from package APIs. |
109
+ | `freesolo.gepa.GEPASetup`, `GEPAConfig`, `DefaultReflectionAgent`, `attach_gepa`, `optimize_gepa` | Optimize prompts through the GEPA adapter using the same environment and dataset abstractions. |
110
+ | `freesolo.contracts.load_contract_text`, `extract_contract_spec`, `load_contract_spec`, `build_oracle_messages` | Read contract markdown and build oracle prompt messages. |
111
+ | `freesolo.utils.oracle.generate_ground_truth_records` | Generate ground-truth JSONL records from source examples using a contract, environment, and oracle model. |
112
+ | `freesolo.utils.upload.upload_tinker_checkpoint_to_huggingface` | Upload a Tinker checkpoint to a private Hugging Face model repo. |
113
+
114
+ ## What Gets Stored
115
+
116
+ - Native OTLP traces and spans
117
+ - Resource attributes like `service.name`
118
+ - Span names, timings, parent span ids, status, and errors
119
+ - Common model attributes such as `gen_ai.system`, `gen_ai.request.model`, and token counts
120
+ - Optional `freesolo.input` and `freesolo.output` span attributes
121
+
122
+ ## Notes
123
+
124
+ - Tracing uses native OpenTelemetry protobuf export to `/api/traces/ingest`.
125
+ - Configure third-party OpenTelemetry instrumentors against the provider returned by `configure_tracer(...)`.
126
+ - Delivery is handled by the OpenTelemetry span processor you configure.
127
+
128
+ ## Evaluations
129
+
130
+ `freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
131
+ eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
132
+ `api_key`.
133
+
134
+ Evaluation data is a list of plain dictionaries. There is no separate `Example`
135
+ class to construct.
136
+
137
+ Define scorers by subclassing `CustomScorer` and returning `BinaryResponse` or
138
+ `NumericResponse`. Scorers run in your process, and Freesolo uploads the final
139
+ results with your API key. Pass scorer objects, not strings.
140
+
141
+ ```python
142
+ from typing import Any
143
+
144
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
145
+
146
+
147
+ class ExactMatch(CustomScorer[BinaryResponse]):
148
+ async def score(self, row: dict[str, Any]) -> BinaryResponse:
149
+ actual = str(row.get("actual_output", "")).strip()
150
+ expected = str(row.get("expected_output", "")).strip()
151
+ return BinaryResponse(
152
+ value=actual == expected and bool(actual),
153
+ reason="actual_output matched expected_output",
154
+ )
155
+
156
+
157
+ client = EvaluationClient()
158
+
159
+ results = client.run(
160
+ name="support-agent-correctness",
161
+ data=[
162
+ {
163
+ "input": "What is the capital of France?",
164
+ "actual_output": "Paris",
165
+ "expected_output": "Paris",
166
+ }
167
+ ],
168
+ scorers=[ExactMatch()],
169
+ )
170
+
171
+ print(results[0].success)
172
+ ```
173
+
174
+ ## Tinker Hugging Face Upload
175
+
176
+ `freesolo.utils.upload` posts a Tinker checkpoint URL to the Freesolo upload
177
+ service and returns the Hugging Face upload response.
178
+
179
+ ```python
180
+ from freesolo.utils.upload import upload_tinker_checkpoint_to_huggingface
181
+
182
+ result = upload_tinker_checkpoint_to_huggingface(
183
+ "tinker://<run_id>/sampler_weights/final",
184
+ base_model="Qwen/Qwen3.5-35B-A3B",
185
+ )
186
+
187
+ print(result["repoId"])
188
+ ```
189
+
190
+ ### Environment-driven evaluations
191
+
192
+ For training contracts, `Environment` describes task behavior for evals and
193
+ GRPO/RL: prompt construction, response normalization, and reward scoring.
194
+ Dataset loading and labeled conversation construction live in `freesolo.datasets`.
195
+ `run_environment` loads task examples, calls your model callback, scores the
196
+ response through the environment, and uploads the same `scorers_data` shape used
197
+ by the eval DB.
198
+
199
+ ```python
200
+ from typing import Any
201
+
202
+ from openai import OpenAI
203
+
204
+ from freesolo.datasets import TaskExample
205
+ from freesolo.environments import (
206
+ Environment,
207
+ EnvironmentGeneration,
208
+ RewardMetric,
209
+ RewardResult,
210
+ )
211
+ from freesolo.evaluation import EvaluationClient
212
+
213
+
214
+ class PromptEnvironment(Environment):
215
+ def build_prompt_messages(
216
+ self,
217
+ example: TaskExample,
218
+ prompt_text: str,
219
+ ):
220
+ return [
221
+ {"role": "system", "content": prompt_text},
222
+ {"role": "user", "content": example.task},
223
+ ]
224
+
225
+ def score_response(
226
+ self,
227
+ example: TaskExample,
228
+ response_text: str,
229
+ ) -> RewardResult:
230
+ passed = response_text.strip() == str(example.expected_output).strip()
231
+ return RewardResult(
232
+ name="exact_match",
233
+ score=1.0 if passed else 0.0,
234
+ success=passed,
235
+ threshold=1.0,
236
+ reason="matched expected output" if passed else "mismatch",
237
+ return_type="binary",
238
+ metrics=(
239
+ RewardMetric(
240
+ name="canonical_match",
241
+ score=1.0 if passed else 0.0,
242
+ success=passed,
243
+ threshold=1.0,
244
+ ),
245
+ ),
246
+ )
247
+
248
+
249
+ model = OpenAI()
250
+
251
+
252
+ def generate(messages: list[dict[str, str]], example: TaskExample):
253
+ response = model.chat.completions.create(
254
+ model="gpt-4.1-mini",
255
+ messages=messages,
256
+ )
257
+ return EnvironmentGeneration(
258
+ response_text=response.choices[0].message.content or "",
259
+ total_tokens=response.usage.total_tokens if response.usage else None,
260
+ )
261
+
262
+
263
+ results = EvaluationClient().run_environment(
264
+ name="contract-eval",
265
+ source="eval.jsonl",
266
+ contract_path="TRAINING_CONTRACT.md",
267
+ environment=PromptEnvironment(),
268
+ generate=generate,
269
+ )
270
+ ```
271
+
272
+ `RewardResult` is the top-level scorer entry stored in
273
+ `eval_tasks.scorers_data`. Its fields are:
274
+
275
+ - `name`: scorer name shown in the UI.
276
+ - `score`: numeric reward value.
277
+ - `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
278
+ from whether `score > 0`.
279
+ - `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
280
+ pass/fail context.
281
+ - `latency_ms`, `total_tokens`: optional per-response usage metadata.
282
+ - `metadata`: JSON object for scorer-specific details.
283
+ - `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
284
+ `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
285
+
286
+ Custom scorer:
287
+
288
+ ```python
289
+ from typing import Any
290
+
291
+ from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
292
+
293
+
294
+ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
295
+ async def score(self, row: dict[str, Any]) -> BinaryResponse:
296
+ ok = bool(str(row.get("actual_output", "")).strip())
297
+ return BinaryResponse(value=ok, reason="actual_output is non-empty")
298
+
299
+
300
+ results = EvaluationClient().run(
301
+ name="support-agent-non-empty",
302
+ data=[{"actual_output": "hello"}],
303
+ scorers=[NoEmptyAnswer()],
304
+ )
305
+ ```
306
+
307
+ LLM-as-judge is also a custom scorer. The scorer can call your judge model and
308
+ return a `NumericResponse`; Freesolo stores the eval run and score output with
309
+ your `FREESOLO_API_KEY`. This example uses `OPENAI_API_KEY` for the judge model
310
+ call and `FREESOLO_API_KEY` for eval upload.
311
+
312
+ ```python
313
+ import json
314
+ from typing import Any
315
+
316
+ from openai import OpenAI
317
+
318
+ from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
319
+
320
+
321
+ class CorrectnessJudge(CustomScorer[NumericResponse]):
322
+ name = "correctness_llm_judge"
323
+ threshold = 0.8
324
+
325
+ def __init__(self, client: OpenAI) -> None:
326
+ self.client = client
327
+
328
+ async def score(self, row: dict[str, Any]) -> NumericResponse:
329
+ response = self.client.responses.create(
330
+ model="gpt-4.1-mini",
331
+ instructions=(
332
+ "Grade correctness from 0.0 to 1.0. "
333
+ "Return JSON only: {\"score\": 0.0, \"reason\": \"...\"}"
334
+ ),
335
+ input=[
336
+ {
337
+ "role": "user",
338
+ "content": [
339
+ {
340
+ "type": "input_text",
341
+ "text": json.dumps(
342
+ {
343
+ "input": row.get("input", ""),
344
+ "actual_output": row.get("actual_output", ""),
345
+ "expected_output": row.get("expected_output", ""),
346
+ }
347
+ ),
348
+ }
349
+ ],
350
+ }
351
+ ],
352
+ )
353
+
354
+ parsed = json.loads(response.output_text or "{}")
355
+ return NumericResponse(
356
+ value=float(parsed["score"]),
357
+ reason=str(parsed.get("reason", "")),
358
+ )
359
+
360
+
361
+ judge_client = OpenAI()
362
+
363
+ results = EvaluationClient().run(
364
+ name="support-agent-correctness",
365
+ data=[
366
+ {
367
+ "input": "What is the capital of France?",
368
+ "actual_output": "Paris is the capital of France.",
369
+ "expected_output": "Paris",
370
+ }
371
+ ],
372
+ scorers=[CorrectnessJudge(judge_client)],
373
+ )
374
+ ```
375
+
376
+ Hosted scorers are also available out of the box and use OpenRouter by default:
377
+
378
+ - `ReferenceCorrectnessScorer`
379
+ - `RubricScorer`
380
+ - `GroundednessScorer`
381
+ - `InstructionFollowingScorer`
382
+ - `PairwisePreferenceScorer`
383
+
384
+ ```python
385
+ from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
386
+
387
+ judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
388
+
389
+ scorer = ReferenceCorrectnessScorer(client=judge)
390
+ ```
391
+
392
+ Tracing is available through the OpenTelemetry helpers in `freesolo.tracing`.