PyPI - freesolo - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

freesolo 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (124) hide show

freesolo-0.2.4/.github/workflows/publish-packages.yml +96 -0
freesolo-0.2.4/.github/workflows/python-checks.yml +41 -0
freesolo-0.2.4/.github/workflows/sync-package-function-usage.yml +38 -0
freesolo-0.2.4/PKG-INFO +392 -0
freesolo-0.2.4/README.md +368 -0
freesolo-0.2.4/examples/PROMPT.md +10 -0
freesolo-0.2.4/examples/README.md +96 -0
freesolo-0.2.4/examples/TRAINING_CONTRACT.md +10 -0
freesolo-0.2.4/examples/data/support_eval.jsonl +3 -0
freesolo-0.2.4/examples/data/support_train.jsonl +3 -0
freesolo-0.2.4/examples/environment.py +110 -0
freesolo-0.2.4/examples/evaluation_custom_scorer.py +105 -0
freesolo-0.2.4/examples/evaluation_from_files.py +47 -0
freesolo-0.2.4/examples/gepa_prompt_example.py +76 -0
freesolo-0.2.4/examples/support_dataset.py +20 -0
freesolo-0.2.4/examples/tracing_manual_span.py +47 -0
freesolo-0.2.4/examples/training_sft_grpo.py +75 -0
freesolo-0.2.4/function_usage_registry.json +12 -0
freesolo-0.2.4/pypi/freesolo/__init__.py +1 -0
freesolo-0.2.4/pypi/freesolo/_usage.py +39 -0
freesolo-0.2.4/pypi/freesolo/contracts/__init__.py +23 -0
freesolo-0.2.4/pypi/freesolo/contracts/markdown.py +76 -0
freesolo-0.2.4/pypi/freesolo/contracts/types.py +29 -0
freesolo-0.2.4/pypi/freesolo/datasets/__init__.py +13 -0
freesolo-0.2.4/pypi/freesolo/datasets/core.py +74 -0
freesolo-0.2.4/pypi/freesolo/datasets/records.py +139 -0
freesolo-0.2.4/pypi/freesolo/datasets/types.py +26 -0
freesolo-0.2.4/pypi/freesolo/environments/__init__.py +17 -0
freesolo-0.2.4/pypi/freesolo/environments/base.py +162 -0
freesolo-0.2.4/pypi/freesolo/environments/evaluation.py +358 -0
freesolo-0.2.4/pypi/freesolo/environments/types.py +79 -0
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/__init__.py +5 -8
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/client.py +86 -18
freesolo-0.2.4/pypi/freesolo/evaluation/judges/__init__.py +19 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/base.py +135 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/groundedness.py +34 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/instruction_following.py +31 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/pairwise_preference.py +45 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/reference_correctness.py +26 -0
freesolo-0.2.4/pypi/freesolo/evaluation/judges/rubric.py +46 -0
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/evaluation/responses.py +16 -8
freesolo-0.2.4/pypi/freesolo/evaluation/results.py +93 -0
freesolo-0.2.4/pypi/freesolo/evaluation/types.py +16 -0
freesolo-0.2.4/pypi/freesolo/gepa/__init__.py +17 -0
freesolo-0.2.4/pypi/freesolo/gepa/adapter.py +226 -0
freesolo-0.2.4/pypi/freesolo/gepa/reflection.py +103 -0
freesolo-0.2.4/pypi/freesolo/gepa/setup.py +219 -0
freesolo-0.2.4/pypi/freesolo/gepa/types.py +120 -0
freesolo-0.2.4/pypi/freesolo/tracing/__init__.py +13 -0
freesolo-0.2.4/pypi/freesolo/tracing/otel.py +250 -0
freesolo-0.2.4/pypi/freesolo/training/__init__.py +10 -0
freesolo-0.2.4/pypi/freesolo/training/grpo/__init__.py +0 -0
freesolo-0.2.4/pypi/freesolo/training/grpo/config.py +38 -0
freesolo-0.2.4/pypi/freesolo/training/grpo/datums.py +196 -0
freesolo-0.2.4/pypi/freesolo/training/grpo/rewards.py +133 -0
freesolo-0.2.4/pypi/freesolo/training/grpo/sampling.py +127 -0
freesolo-0.2.4/pypi/freesolo/training/storage.py +54 -0
freesolo-0.2.4/pypi/freesolo/training/train_grpo.py +429 -0
freesolo-0.2.4/pypi/freesolo/training/train_sft.py +284 -0
freesolo-0.2.4/pypi/freesolo/training/types.py +34 -0
freesolo-0.2.4/pypi/freesolo/utils/__init__.py +0 -0
freesolo-0.2.4/pypi/freesolo/utils/checkpoints.py +239 -0
freesolo-0.2.4/pypi/freesolo/utils/core.py +255 -0
freesolo-0.2.4/pypi/freesolo/utils/openrouter.py +180 -0
freesolo-0.2.4/pypi/freesolo/utils/oracle.py +240 -0
freesolo-0.2.4/pypi/freesolo/utils/storage.py +239 -0
freesolo-0.2.4/pypi/freesolo/utils/upload.py +60 -0
freesolo-0.2.4/pypi/freesolo/utils/wandb.py +303 -0
freesolo-0.2.4/pyproject.toml +36 -0
{freesolo-0.2.2 → freesolo-0.2.4}/ruff.toml +3 -0
freesolo-0.2.4/scripts/sync_package_function_usage.py +81 -0
freesolo-0.2.4/tests/end_to_end_testing/test_environment_evaluation_flow.py +140 -0
freesolo-0.2.4/tests/end_to_end_testing/test_examples.py +141 -0
freesolo-0.2.4/tests/functionality/test_datasets.py +113 -0
freesolo-0.2.4/tests/functionality/test_evaluation_client.py +161 -0
freesolo-0.2.4/tests/functionality/test_gepa_adapter.py +133 -0
freesolo-0.2.4/tests/functionality/test_records_rewards_and_config.py +126 -0
freesolo-0.2.4/tests/functionality/test_storage_sync.py +447 -0
freesolo-0.2.4/tests/functionality/test_tracing_opentelemetry.py +128 -0
freesolo-0.2.4/tests/functionality/test_upload.py +97 -0
freesolo-0.2.4/tests/functionality/test_usage_registry.py +63 -0
freesolo-0.2.4/tests/functionality/test_utils_checkpoints.py +106 -0
freesolo-0.2.4/tests/security/test_sanitize_and_contract_security.py +60 -0
freesolo-0.2.4/uv.lock +3328 -0
freesolo-0.2.2/PKG-INFO +0 -342
freesolo-0.2.2/README.md +0 -328
freesolo-0.2.2/pypi/examples/.env.example +0 -11
freesolo-0.2.2/pypi/examples/__init__.py +0 -1
freesolo-0.2.2/pypi/examples/anthropic/__init__.py +0 -1
freesolo-0.2.2/pypi/examples/anthropic/chat.py +0 -56
freesolo-0.2.2/pypi/examples/anthropic/vision.py +0 -82
freesolo-0.2.2/pypi/examples/gemini/__init__.py +0 -1
freesolo-0.2.2/pypi/examples/gemini/chat.py +0 -53
freesolo-0.2.2/pypi/examples/gemini/vision.py +0 -79
freesolo-0.2.2/pypi/examples/openai/__init__.py +0 -1
freesolo-0.2.2/pypi/examples/openai/chat.py +0 -56
freesolo-0.2.2/pypi/examples/openai/vision.py +0 -64
freesolo-0.2.2/pypi/examples/openrouter/__init__.py +0 -1
freesolo-0.2.2/pypi/examples/openrouter/chat.py +0 -60
freesolo-0.2.2/pypi/examples/utils.py +0 -231
freesolo-0.2.2/pypi/freesolo/__init__.py +0 -59
freesolo-0.2.2/pypi/freesolo/evaluation/hosted.py +0 -404
freesolo-0.2.2/pypi/freesolo/evaluation/judges.py +0 -27
freesolo-0.2.2/pypi/freesolo/evaluation/results.py +0 -61
freesolo-0.2.2/pypi/freesolo/evaluation/utils.py +0 -11
freesolo-0.2.2/pypi/freesolo/sdk.py +0 -52
freesolo-0.2.2/pypi/freesolo/tracing/__init__.py +0 -27
freesolo-0.2.2/pypi/freesolo/tracing/client.py +0 -583
freesolo-0.2.2/pypi/freesolo/tracing/decorators.py +0 -63
freesolo-0.2.2/pypi/freesolo/tracing/providers/__init__.py +0 -14
freesolo-0.2.2/pypi/freesolo/tracing/providers/anthropic.py +0 -111
freesolo-0.2.2/pypi/freesolo/tracing/providers/config.py +0 -101
freesolo-0.2.2/pypi/freesolo/tracing/providers/gemini.py +0 -205
freesolo-0.2.2/pypi/freesolo/tracing/providers/openai.py +0 -208
freesolo-0.2.2/pypi/freesolo/tracing/providers/utils.py +0 -276
freesolo-0.2.2/pypi/freesolo/tracing/utils.py +0 -15
freesolo-0.2.2/pypi/freesolo/utils.py +0 -37
freesolo-0.2.2/pyproject.toml +0 -26
freesolo-0.2.2/uv.lock +0 -904
{freesolo-0.2.2 → freesolo-0.2.4}/.env.example +0 -0
{freesolo-0.2.2 → freesolo-0.2.4}/.gitignore +0 -0
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/.gitignore +0 -0
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/py.typed +0 -0
{freesolo-0.2.2 → freesolo-0.2.4}/pypi/freesolo/tracing/sanitize.py +0 -0

freesolo-0.2.4/.github/workflows/publish-packages.yml ADDED Viewed

@@ -0,0 +1,96 @@
+name: Publish packages
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "pyproject.toml"
+      - "uv.lock"
+      - "pypi/**"
+      - "examples/**"
+      - ".github/workflows/publish-packages.yml"
+  workflow_dispatch:
+concurrency:
+  group: publish-packages-${{ github.ref }}
+  cancel-in-progress: false
+jobs:
+  publish-pypi:
+    name: Publish PyPI package
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    env:
+      UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+      - name: Read package metadata
+        id: metadata
+        run: |
+          python - <<'PY' >> "$GITHUB_OUTPUT"
+          import tomllib
+          with open("pyproject.toml", "rb") as f:
+              project = tomllib.load(f)["project"]
+          print(f"name={project['name']}")
+          print(f"version={project['version']}")
+          PY
+      - name: Check PyPI for existing version
+        id: pypi
+        env:
+          PACKAGE_NAME: ${{ steps.metadata.outputs.name }}
+          PACKAGE_VERSION: ${{ steps.metadata.outputs.version }}
+        run: |
+          python - <<'PY' >> "$GITHUB_OUTPUT"
+          import os
+          import urllib.error
+          import urllib.request
+          name = os.environ["PACKAGE_NAME"]
+          version = os.environ["PACKAGE_VERSION"]
+          url = f"https://pypi.org/pypi/{name}/{version}/json"
+          try:
+              with urllib.request.urlopen(url, timeout=30) as response:
+                  exists = response.status == 200
+          except urllib.error.HTTPError as error:
+              if error.code != 404:
+                  raise
+              exists = False
+          print(f"exists={'true' if exists else 'false'}")
+          PY
+      - name: Skip existing PyPI version
+        if: steps.pypi.outputs.exists == 'true'
+        run: echo "${{ steps.metadata.outputs.name }} ${{ steps.metadata.outputs.version }} is already on PyPI."
+      - name: Install uv
+        if: steps.pypi.outputs.exists != 'true'
+        run: python -m pip install --upgrade uv
+      - name: Build distributions
+        if: steps.pypi.outputs.exists != 'true'
+        run: |
+          rm -rf dist
+          uv build
+      - name: Publish to PyPI
+        if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN != ''
+        run: uv publish
+      - name: Skip publish without PyPI token
+        if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN == ''
+        run: |
+          echo "PYPI_API_TOKEN is not configured; built distributions but skipped upload."
+          echo "Add a PYPI_API_TOKEN repository secret to publish this package."

freesolo-0.2.4/.github/workflows/python-checks.yml ADDED Viewed

@@ -0,0 +1,41 @@
+name: Python checks
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  checks:
+    name: Ruff and tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+      - name: Install uv
+        run: python3 -m pip install --upgrade uv
+      - name: Install dependencies
+        run: uv sync --locked --extra dev
+      - name: Python compile check
+        run: python3 -m py_compile $(find pypi tests -name '*.py' -print)
+      - name: Ruff check
+        run: uv run --extra dev python -m ruff check .
+      - name: Ruff format check
+        run: uv run --extra dev python -m ruff format --check .
+      - name: Tests
+        run: uv run --extra dev python -m pytest tests

freesolo-0.2.4/.github/workflows/sync-package-function-usage.yml ADDED Viewed

@@ -0,0 +1,38 @@
+name: Sync package function usage
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "function_usage_registry.json"
+      - "scripts/sync_package_function_usage.py"
+      - ".github/workflows/sync-package-function-usage.yml"
+  workflow_dispatch:
+permissions:
+  contents: read
+jobs:
+  sync:
+    name: Sync usage registry
+    runs-on: ubuntu-latest
+    if: ${{ github.ref == 'refs/heads/main' }}
+    env:
+      SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+      SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+      - name: Sync package function rows
+        if: env.SUPABASE_URL != '' && env.SUPABASE_SERVICE_ROLE_KEY != ''
+        run: python scripts/sync_package_function_usage.py --remove-stale
+      - name: Skip without Supabase secrets
+        if: env.SUPABASE_URL == '' || env.SUPABASE_SERVICE_ROLE_KEY == ''
+        run: echo "SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY secrets are required to sync usage."

freesolo-0.2.4/PKG-INFO ADDED Viewed

@@ -0,0 +1,392 @@
+Metadata-Version: 2.4
+Name: freesolo
+Version: 0.2.4
+Summary: Tracing, evaluation, and training utilities for LLM applications.
+Requires-Python: >=3.11
+Requires-Dist: gepa>=0.1.1
+Requires-Dist: httpx>=0.27.0
+Requires-Dist: jsonschema>=4.0.0
+Requires-Dist: numpy>=1.26.0
+Requires-Dist: opentelemetry-api>=1.28.0
+Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.28.0
+Requires-Dist: opentelemetry-sdk>=1.28.0
+Requires-Dist: pymongo>=4.0.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: tinker-cookbook>=0.3.0
+Requires-Dist: tinker>=0.19.0
+Requires-Dist: wandb>=0.17.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == 'dev'
+Requires-Dist: ruff>=0.11.0; extra == 'dev'
+Provides-Extra: examples
+Requires-Dist: openai>=1.0.0; extra == 'examples'
+Description-Content-Type: text/markdown
+# freesolo
+`freesolo` is a Python package for tracing, evaluating, and training LLM apps.
+It is built for the lowest-friction integration possible:
+1. Install the package
+2. Set `FREESOLO_API_KEY`
+3. Configure the tracer
+4. Run traces and evaluations from the package APIs
+## Install
+Install the package:
+```bash
+pip install freesolo
+```
+## Environment
+- `FREESOLO_API_KEY`
+- `FREESOLO_BASE_URL` (optional, defaults to `https://api.freesolo.co`)
+```bash
+export FREESOLO_API_KEY=fslo_...
+```
+## Quickstart
+```python
+from freesolo.tracing import configure_tracer, get_tracer
+configure_tracer(service_name="my-llm-app")
+tracer = get_tracer()
+with tracer.start_as_current_span(
+    "model.call",
+    attributes={
+        "gen_ai.system": "openai",
+        "gen_ai.request.model": "gpt-5.5",
+        "freesolo.input": {"prompt": "How do I reset my password?"},
+    },
+) as span:
+    result = "Reset it from account settings."
+    span.set_attribute("freesolo.output", result)
+```
+## Runnable Examples
+Copy-pasteable examples live in [`examples/`](examples/):
+- `tracing_manual_span.py`: configure OpenTelemetry and send one application span.
+- `evaluation_custom_scorer.py`: run custom binary and numeric eval scorers.
+- `evaluation_from_files.py`: run evals from a concrete dataset and environment.
+- `environment.py`: example environment used by evals, training, and GEPA.
+- `support_dataset.py`: example dataset paths and loaders used by evals, SFT, GRPO, and GEPA.
+- `gepa_prompt_example.py`: run the Freesolo GEPA adapter over the example dataset.
+- `training_sft_grpo.py`: start SFT or GRPO training runs from package APIs.
+From a repo checkout:
+```bash
+cd freesolo-sdk
+export PYTHONPATH="$PWD/pypi"
+uv run python examples/evaluation_custom_scorer.py --local
+```
+## Public API
+The root `freesolo` module intentionally exports no functions. Import from the
+subpackages below; lower-level modules may be importable, but they are
+implementation helpers unless they appear here or in an example.
+| Import | Use case |
+| --- | --- |
+| `freesolo.tracing.configure_tracer`, `get_tracer`, `force_flush`, `shutdown` | Send OpenTelemetry traces from an application to Freesolo. |
+| `freesolo.evaluation.EvaluationClient` | Run custom-scorer evals or environment evals and upload results to Freesolo. |
+| `freesolo.evaluation.run_local_evaluation` | Run custom scorers locally without uploading results. |
+| `freesolo.evaluation.CustomScorer`, `BinaryResponse`, `NumericResponse` | Define local scorer logic for eval rows. |
+| `freesolo.evaluation.HostedJudgeClient` and hosted scorer classes | Use hosted LLM-as-judge scorers with OpenRouter-compatible credentials. |
+| `freesolo.datasets.TaskExample`, `Dataset`, `load_dataset` | Load task examples and construct labeled conversations for evals or training. |
+| `freesolo.environments.Environment`, `RewardResult`, `RewardMetric`, `GrpoConfig`, `EnvironmentGeneration` | Define task behavior once for evals, GEPA, SFT, and GRPO. |
+| `freesolo.training.SftConfig`, `TrainGrpoOptions`, `train_sft`, `train_grpo` | Start SFT or GRPO training from package APIs. |
+| `freesolo.gepa.GEPASetup`, `GEPAConfig`, `DefaultReflectionAgent`, `attach_gepa`, `optimize_gepa` | Optimize prompts through the GEPA adapter using the same environment and dataset abstractions. |
+| `freesolo.contracts.load_contract_text`, `extract_contract_spec`, `load_contract_spec`, `build_oracle_messages` | Read contract markdown and build oracle prompt messages. |
+| `freesolo.utils.oracle.generate_ground_truth_records` | Generate ground-truth JSONL records from source examples using a contract, environment, and oracle model. |
+| `freesolo.utils.upload.upload_tinker_checkpoint_to_huggingface` | Upload a Tinker checkpoint to a private Hugging Face model repo. |
+## What Gets Stored
+- Native OTLP traces and spans
+- Resource attributes like `service.name`
+- Span names, timings, parent span ids, status, and errors
+- Common model attributes such as `gen_ai.system`, `gen_ai.request.model`, and token counts
+- Optional `freesolo.input` and `freesolo.output` span attributes
+## Notes
+- Tracing uses native OpenTelemetry protobuf export to `/api/traces/ingest`.
+- Configure third-party OpenTelemetry instrumentors against the provider returned by `configure_tracer(...)`.
+- Delivery is handled by the OpenTelemetry span processor you configure.
+## Evaluations
+`freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
+eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
+`api_key`.
+Evaluation data is a list of plain dictionaries. There is no separate `Example`
+class to construct.
+Define scorers by subclassing `CustomScorer` and returning `BinaryResponse` or
+`NumericResponse`. Scorers run in your process, and Freesolo uploads the final
+results with your API key. Pass scorer objects, not strings.
+```python
+from typing import Any
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
+class ExactMatch(CustomScorer[BinaryResponse]):
+    async def score(self, row: dict[str, Any]) -> BinaryResponse:
+        actual = str(row.get("actual_output", "")).strip()
+        expected = str(row.get("expected_output", "")).strip()
+        return BinaryResponse(
+            value=actual == expected and bool(actual),
+            reason="actual_output matched expected_output",
+        )
+client = EvaluationClient()
+results = client.run(
+    name="support-agent-correctness",
+    data=[
+        {
+            "input": "What is the capital of France?",
+            "actual_output": "Paris",
+            "expected_output": "Paris",
+        }
+    ],
+    scorers=[ExactMatch()],
+)
+print(results[0].success)
+```
+## Tinker Hugging Face Upload
+`freesolo.utils.upload` posts a Tinker checkpoint URL to the Freesolo upload
+service and returns the Hugging Face upload response.
+```python
+from freesolo.utils.upload import upload_tinker_checkpoint_to_huggingface
+result = upload_tinker_checkpoint_to_huggingface(
+    "tinker://<run_id>/sampler_weights/final",
+    base_model="Qwen/Qwen3.5-35B-A3B",
+)
+print(result["repoId"])
+```
+## Environment-driven evaluations
+For training contracts, `Environment` describes task behavior for evals and
+GRPO/RL: prompt construction, response normalization, and reward scoring.
+Dataset loading and labeled conversation construction live in `freesolo.datasets`.
+`run_environment` loads task examples, calls your model callback, scores the
+response through the environment, and uploads the same `scorers_data` shape used
+by the eval DB.
+```python
+from typing import Any
+from openai import OpenAI
+from freesolo.datasets import TaskExample
+from freesolo.environments import (
+    Environment,
+    EnvironmentGeneration,
+    RewardMetric,
+    RewardResult,
+)
+from freesolo.evaluation import EvaluationClient
+class PromptEnvironment(Environment):
+    def build_prompt_messages(
+        self,
+        example: TaskExample,
+        prompt_text: str,
+    ):
+        return [
+            {"role": "system", "content": prompt_text},
+            {"role": "user", "content": example.task},
+        ]
+    def score_response(
+        self,
+        example: TaskExample,
+        response_text: str,
+    ) -> RewardResult:
+        passed = response_text.strip() == str(example.expected_output).strip()
+        return RewardResult(
+            name="exact_match",
+            score=1.0 if passed else 0.0,
+            success=passed,
+            threshold=1.0,
+            reason="matched expected output" if passed else "mismatch",
+            return_type="binary",
+            metrics=(
+                RewardMetric(
+                    name="canonical_match",
+                    score=1.0 if passed else 0.0,
+                    success=passed,
+                    threshold=1.0,
+                ),
+            ),
+        )
+model = OpenAI()
+def generate(messages: list[dict[str, str]], example: TaskExample):
+    response = model.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=messages,
+    )
+    return EnvironmentGeneration(
+        response_text=response.choices[0].message.content or "",
+        total_tokens=response.usage.total_tokens if response.usage else None,
+    )
+results = EvaluationClient().run_environment(
+    name="contract-eval",
+    source="eval.jsonl",
+    contract_path="TRAINING_CONTRACT.md",
+    environment=PromptEnvironment(),
+    generate=generate,
+)
+```
+`RewardResult` is the top-level scorer entry stored in
+`eval_tasks.scorers_data`. Its fields are:
+- `name`: scorer name shown in the UI.
+- `score`: numeric reward value.
+- `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
+  from whether `score > 0`.
+- `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
+  pass/fail context.
+- `latency_ms`, `total_tokens`: optional per-response usage metadata.
+- `metadata`: JSON object for scorer-specific details.
+- `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
+  `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
+Custom scorer:
+```python
+from typing import Any
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
+class NoEmptyAnswer(CustomScorer[BinaryResponse]):
+    async def score(self, row: dict[str, Any]) -> BinaryResponse:
+        ok = bool(str(row.get("actual_output", "")).strip())
+        return BinaryResponse(value=ok, reason="actual_output is non-empty")
+results = EvaluationClient().run(
+    name="support-agent-non-empty",
+    data=[{"actual_output": "hello"}],
+    scorers=[NoEmptyAnswer()],
+)
+```
+LLM-as-judge is also a custom scorer. The scorer can call your judge model and
+return a `NumericResponse`; Freesolo stores the eval run and score output with
+your `FREESOLO_API_KEY`. This example uses `OPENAI_API_KEY` for the judge model
+call and `FREESOLO_API_KEY` for eval upload.
+```python
+import json
+from typing import Any
+from openai import OpenAI
+from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
+class CorrectnessJudge(CustomScorer[NumericResponse]):
+    name = "correctness_llm_judge"
+    threshold = 0.8
+    def __init__(self, client: OpenAI) -> None:
+        self.client = client
+    async def score(self, row: dict[str, Any]) -> NumericResponse:
+        response = self.client.responses.create(
+            model="gpt-4.1-mini",
+            instructions=(
+                "Grade correctness from 0.0 to 1.0. "
+                "Return JSON only: {\"score\": 0.0, \"reason\": \"...\"}"
+            ),
+            input=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_text",
+                            "text": json.dumps(
+                                {
+                                    "input": row.get("input", ""),
+                                    "actual_output": row.get("actual_output", ""),
+                                    "expected_output": row.get("expected_output", ""),
+                                }
+                            ),
+                        }
+                    ],
+                }
+            ],
+        )
+        parsed = json.loads(response.output_text or "{}")
+        return NumericResponse(
+            value=float(parsed["score"]),
+            reason=str(parsed.get("reason", "")),
+        )
+judge_client = OpenAI()
+results = EvaluationClient().run(
+    name="support-agent-correctness",
+    data=[
+        {
+            "input": "What is the capital of France?",
+            "actual_output": "Paris is the capital of France.",
+            "expected_output": "Paris",
+        }
+    ],
+    scorers=[CorrectnessJudge(judge_client)],
+)
+```
+Hosted scorers are also available out of the box and use OpenRouter by default:
+- `ReferenceCorrectnessScorer`
+- `RubricScorer`
+- `GroundednessScorer`
+- `InstructionFollowingScorer`
+- `PairwisePreferenceScorer`
+```python
+from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
+judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
+scorer = ReferenceCorrectnessScorer(client=judge)
+```
+Tracing is available through the OpenTelemetry helpers in `freesolo.tracing`.

freesolo 0.2.2__tar.gz → 0.2.4__tar.gz

freesolo 0.2.2__tar.gz → 0.2.4__tar.gz