PyPI - freesolo - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

freesolo 0.2.2.tar.gz → 0.2.3.tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

freesolo-0.2.2/README.md → freesolo-0.2.3/PKG-INFO RENAMED Viewed

@@ -1,3 +1,21 @@
+Metadata-Version: 2.4
+Name: freesolo
+Version: 0.2.3
+Summary: Tracing, evaluation, and training utilities for LLM applications.
+Requires-Python: >=3.10
+Requires-Dist: httpx>=0.27.0
+Requires-Dist: wandb>=0.17.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == 'dev'
+Requires-Dist: ruff>=0.11.0; extra == 'dev'
+Provides-Extra: examples
+Requires-Dist: anthropic>=0.40.0; extra == 'examples'
+Requires-Dist: google-genai>=1.0.0; extra == 'examples'
+Requires-Dist: openai>=1.0.0; extra == 'examples'
+Provides-Extra: gepa
+Requires-Dist: gepa>=0.1.1; extra == 'gepa'
+Description-Content-Type: text/markdown
 # freesolo
 `freesolo` is a Python tracing and evaluation package for LLM apps.
@@ -7,7 +25,7 @@ It is built for the lowest-friction integration possible:
 1. Install the package
 2. Set `FREESOLO_API_KEY`
 3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
-4. Run traces and evaluations from the same SDK
+4. Run traces and evaluations from the package APIs
 ## Current provider support
@@ -20,7 +38,7 @@ It is built for the lowest-friction integration possible:
 ## Install
-Install the package plus the provider SDK you use:
+Install the package plus the provider client you use:
 ```bash
 pip install freesolo openai
@@ -154,7 +172,7 @@ with start_trace("support-agent-run"):
 ## Evaluations
-`freesolo` also includes a small evaluation SDK for CI jobs, GitHub bots, and
+`freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
 eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
 `api_key`.
@@ -168,8 +186,7 @@ results with your API key. Pass scorer objects, not strings.
 ```python
 from typing import Any
-from freesolo import Freesolo
-from freesolo.evaluation import BinaryResponse, CustomScorer
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
 class ExactMatch(CustomScorer[BinaryResponse]):
@@ -182,9 +199,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
         )
-client = Freesolo()
+client = EvaluationClient()
-results = client.evals.run(
+results = client.run(
     name="support-agent-correctness",
     data=[
         {
@@ -199,13 +216,123 @@ results = client.evals.run(
 print(results[0].success)
 ```
+## Tinker Deployment
+`freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
+a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
+the server JSON response.
+```python
+from freesolo.utils.deployment import deploy_tinker_checkpoint
+result = deploy_tinker_checkpoint(
+    "tinker://<run_id>/sampler_weights/final",
+    base_model="Qwen/Qwen3.5-35B-A3B",
+)
+print(result["repoId"])
+```
+### Environment-driven evaluations
+For training contracts, you can use the same `Environment` adapter for evals,
+SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
+your model callback, scores the response through the environment, and uploads
+the same `scorers_data` shape used by the eval DB.
+```python
+from typing import Any
+from openai import OpenAI
+from freesolo.environments import (
+    Environment,
+    EnvironmentGeneration,
+    RewardMetric,
+    RewardResult,
+    TaskExample,
+)
+from freesolo.evaluation import EvaluationClient
+class ContractEnvironment(Environment):
+    def build_prompt_messages(
+        self,
+        example: TaskExample,
+        contract_text: str,
+    ):
+        return [
+            {"role": "system", "content": contract_text},
+            {"role": "user", "content": example.task},
+        ]
+    def score_response(
+        self,
+        example: TaskExample,
+        response_text: str,
+    ) -> RewardResult:
+        passed = response_text.strip() == str(example.expected_output).strip()
+        return RewardResult(
+            name="exact_match",
+            score=1.0 if passed else 0.0,
+            success=passed,
+            threshold=1.0,
+            reason="matched expected output" if passed else "mismatch",
+            return_type="binary",
+            metrics=(
+                RewardMetric(
+                    name="canonical_match",
+                    score=1.0 if passed else 0.0,
+                    success=passed,
+                    threshold=1.0,
+                ),
+            ),
+        )
+model = OpenAI()
+def generate(messages: list[dict[str, str]], example: TaskExample):
+    response = model.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=messages,
+    )
+    return EnvironmentGeneration(
+        response_text=response.choices[0].message.content or "",
+        total_tokens=response.usage.total_tokens if response.usage else None,
+    )
+results = EvaluationClient().run_environment(
+    name="contract-eval",
+    source="eval.jsonl",
+    contract_path="TRAINING_CONTRACT.md",
+    environment=ContractEnvironment(),
+    generate=generate,
+)
+```
+`RewardResult` is the top-level scorer entry stored in
+`eval_tasks.scorers_data`. Its fields are:
+- `name`: scorer name shown in the UI.
+- `score`: numeric reward value.
+- `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
+  from whether `score > 0`.
+- `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
+  pass/fail context.
+- `latency_ms`, `total_tokens`: optional per-response usage metadata.
+- `metadata`: JSON object for scorer-specific details.
+- `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
+  `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
 Custom scorer:
 ```python
 from typing import Any
-from freesolo import Freesolo
-from freesolo.evaluation import BinaryResponse, CustomScorer
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
 class NoEmptyAnswer(CustomScorer[BinaryResponse]):
@@ -214,7 +341,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
         return BinaryResponse(value=ok, reason="actual_output is non-empty")
-results = Freesolo().evals.run(
+results = EvaluationClient().run(
     name="support-agent-non-empty",
     data=[{"actual_output": "hello"}],
     scorers=[NoEmptyAnswer()],
@@ -232,8 +359,8 @@ from typing import Any
 from openai import OpenAI
-from freesolo import Freesolo, instrument_openai
-from freesolo.evaluation import CustomScorer, NumericResponse
+from freesolo import instrument_openai
+from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
 class CorrectnessJudge(CustomScorer[NumericResponse]):
@@ -278,7 +405,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
 judge_client = instrument_openai(OpenAI())
-results = Freesolo().evals.run(
+results = EvaluationClient().run(
     name="support-agent-correctness",
     data=[
         {
@@ -302,27 +429,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
 ```python
 from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
-judge = HostedJudgeClient(
-    api_key="YOUR_OPENROUTER_API_KEY",
-    model="openai/gpt-oss-120b",
-)
+judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
 scorer = ReferenceCorrectnessScorer(client=judge)
 ```
-Tracing is available from the same root client:
+Tracing is available through namespaced helpers:
 ```python
-from freesolo import Freesolo
+from freesolo.tracing import start_trace
-client = Freesolo()
-with client.traces.start("support-agent-run"):
+with start_trace("support-agent-run"):
     ...
 ```
-You can also import namespaced tracing helpers directly:
-```python
-from freesolo.tracing import start_trace, wrap
-```

freesolo-0.2.2/PKG-INFO → freesolo-0.2.3/README.md RENAMED Viewed

@@ -1,17 +1,3 @@
-Metadata-Version: 2.4
-Name: freesolo
-Version: 0.2.2
-Summary: Tracing and evaluation SDK for LLM applications.
-Requires-Python: >=3.10
-Requires-Dist: httpx>=0.27.0
-Provides-Extra: dev
-Requires-Dist: ruff>=0.11.0; extra == 'dev'
-Provides-Extra: examples
-Requires-Dist: anthropic>=0.40.0; extra == 'examples'
-Requires-Dist: google-genai>=1.0.0; extra == 'examples'
-Requires-Dist: openai>=1.0.0; extra == 'examples'
-Description-Content-Type: text/markdown
 # freesolo
 `freesolo` is a Python tracing and evaluation package for LLM apps.
@@ -21,7 +7,7 @@ It is built for the lowest-friction integration possible:
 1. Install the package
 2. Set `FREESOLO_API_KEY`
 3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
-4. Run traces and evaluations from the same SDK
+4. Run traces and evaluations from the package APIs
 ## Current provider support
@@ -34,7 +20,7 @@ It is built for the lowest-friction integration possible:
 ## Install
-Install the package plus the provider SDK you use:
+Install the package plus the provider client you use:
 ```bash
 pip install freesolo openai
@@ -168,7 +154,7 @@ with start_trace("support-agent-run"):
 ## Evaluations
-`freesolo` also includes a small evaluation SDK for CI jobs, GitHub bots, and
+`freesolo` also includes a small evaluation API for CI jobs, GitHub bots, and
 eval scripts. All evaluation runs require `FREESOLO_API_KEY` or an explicit
 `api_key`.
@@ -182,8 +168,7 @@ results with your API key. Pass scorer objects, not strings.
 ```python
 from typing import Any
-from freesolo import Freesolo
-from freesolo.evaluation import BinaryResponse, CustomScorer
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
 class ExactMatch(CustomScorer[BinaryResponse]):
@@ -196,9 +181,9 @@ class ExactMatch(CustomScorer[BinaryResponse]):
         )
-client = Freesolo()
+client = EvaluationClient()
-results = client.evals.run(
+results = client.run(
     name="support-agent-correctness",
     data=[
         {
@@ -213,13 +198,123 @@ results = client.evals.run(
 print(results[0].success)
 ```
+## Tinker Deployment
+`freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
+a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
+the server JSON response.
+```python
+from freesolo.utils.deployment import deploy_tinker_checkpoint
+result = deploy_tinker_checkpoint(
+    "tinker://<run_id>/sampler_weights/final",
+    base_model="Qwen/Qwen3.5-35B-A3B",
+)
+print(result["repoId"])
+```
+### Environment-driven evaluations
+For training contracts, you can use the same `Environment` adapter for evals,
+SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
+your model callback, scores the response through the environment, and uploads
+the same `scorers_data` shape used by the eval DB.
+```python
+from typing import Any
+from openai import OpenAI
+from freesolo.environments import (
+    Environment,
+    EnvironmentGeneration,
+    RewardMetric,
+    RewardResult,
+    TaskExample,
+)
+from freesolo.evaluation import EvaluationClient
+class ContractEnvironment(Environment):
+    def build_prompt_messages(
+        self,
+        example: TaskExample,
+        contract_text: str,
+    ):
+        return [
+            {"role": "system", "content": contract_text},
+            {"role": "user", "content": example.task},
+        ]
+    def score_response(
+        self,
+        example: TaskExample,
+        response_text: str,
+    ) -> RewardResult:
+        passed = response_text.strip() == str(example.expected_output).strip()
+        return RewardResult(
+            name="exact_match",
+            score=1.0 if passed else 0.0,
+            success=passed,
+            threshold=1.0,
+            reason="matched expected output" if passed else "mismatch",
+            return_type="binary",
+            metrics=(
+                RewardMetric(
+                    name="canonical_match",
+                    score=1.0 if passed else 0.0,
+                    success=passed,
+                    threshold=1.0,
+                ),
+            ),
+        )
+model = OpenAI()
+def generate(messages: list[dict[str, str]], example: TaskExample):
+    response = model.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=messages,
+    )
+    return EnvironmentGeneration(
+        response_text=response.choices[0].message.content or "",
+        total_tokens=response.usage.total_tokens if response.usage else None,
+    )
+results = EvaluationClient().run_environment(
+    name="contract-eval",
+    source="eval.jsonl",
+    contract_path="TRAINING_CONTRACT.md",
+    environment=ContractEnvironment(),
+    generate=generate,
+)
+```
+`RewardResult` is the top-level scorer entry stored in
+`eval_tasks.scorers_data`. Its fields are:
+- `name`: scorer name shown in the UI.
+- `score`: numeric reward value.
+- `success`: pass/fail. If omitted, Freesolo derives it from `threshold`, then
+  from whether `score > 0`.
+- `threshold`, `value`, `reason`, `error`, `return_type`: scorer display and
+  pass/fail context.
+- `latency_ms`, `total_tokens`: optional per-response usage metadata.
+- `metadata`: JSON object for scorer-specific details.
+- `metrics`: optional `RewardMetric` components, also JSON-only, with `name`,
+  `score`, `value`, `success`, `threshold`, `weight`, `reason`, and `metadata`.
 Custom scorer:
 ```python
 from typing import Any
-from freesolo import Freesolo
-from freesolo.evaluation import BinaryResponse, CustomScorer
+from freesolo.evaluation import BinaryResponse, CustomScorer, EvaluationClient
 class NoEmptyAnswer(CustomScorer[BinaryResponse]):
@@ -228,7 +323,7 @@ class NoEmptyAnswer(CustomScorer[BinaryResponse]):
         return BinaryResponse(value=ok, reason="actual_output is non-empty")
-results = Freesolo().evals.run(
+results = EvaluationClient().run(
     name="support-agent-non-empty",
     data=[{"actual_output": "hello"}],
     scorers=[NoEmptyAnswer()],
@@ -246,8 +341,8 @@ from typing import Any
 from openai import OpenAI
-from freesolo import Freesolo, instrument_openai
-from freesolo.evaluation import CustomScorer, NumericResponse
+from freesolo import instrument_openai
+from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
 class CorrectnessJudge(CustomScorer[NumericResponse]):
@@ -292,7 +387,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
 judge_client = instrument_openai(OpenAI())
-results = Freesolo().evals.run(
+results = EvaluationClient().run(
     name="support-agent-correctness",
     data=[
         {
@@ -316,27 +411,16 @@ Hosted scorers are also available out of the box and use OpenRouter by default:
 ```python
 from freesolo.evaluation import HostedJudgeClient, ReferenceCorrectnessScorer
-judge = HostedJudgeClient(
-    api_key="YOUR_OPENROUTER_API_KEY",
-    model="openai/gpt-oss-120b",
-)
+judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
 scorer = ReferenceCorrectnessScorer(client=judge)
 ```
-Tracing is available from the same root client:
+Tracing is available through namespaced helpers:
 ```python
-from freesolo import Freesolo
+from freesolo.tracing import start_trace
-client = Freesolo()
-with client.traces.start("support-agent-run"):
+with start_trace("support-agent-run"):
     ...
 ```
-You can also import namespaced tracing helpers directly:
-```python
-from freesolo.tracing import start_trace, wrap
-```

{freesolo-0.2.2 → freesolo-0.2.3}/pypi/examples/.env.example RENAMED Viewed

@@ -1,5 +1,4 @@
 OPENAI_API_KEY=
-FREESOLO_JUDGE_MODEL=gpt-4.1-mini
 OPENROUTER_API_KEY=
 ANTHROPIC_API_KEY=

freesolo-0.2.3/pypi/examples/evals/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Evaluation examples for the Python package."""

freesolo-0.2.3/pypi/examples/evals/exact_match.py ADDED Viewed

@@ -0,0 +1,105 @@
+from __future__ import annotations
+import argparse
+from typing import Any
+from freesolo import BinaryResponse, CustomScorer
+from freesolo.evaluation import EvaluationClient
+from ..utils import configure_example
+class ExactMatchScorer(CustomScorer[BinaryResponse]):
+    name = "exact_match"
+    async def score(self, row: dict[str, Any]) -> BinaryResponse:
+        actual = str(row.get("actual_output", "")).strip()
+        expected = str(row.get("expected_output", "")).strip()
+        success = bool(actual) and actual == expected
+        return BinaryResponse(
+            success,
+            reason=(
+                "actual_output matched expected_output"
+                if success
+                else f'expected "{expected}" but got "{actual or "<empty>"}"'
+            ),
+        )
+class NonEmptyOutputScorer(CustomScorer[BinaryResponse]):
+    name = "non_empty_output"
+    async def score(self, row: dict[str, Any]) -> BinaryResponse:
+        actual = str(row.get("actual_output", "")).strip()
+        return BinaryResponse(
+            bool(actual),
+            reason=(
+                "actual_output is non-empty" if actual else "actual_output was empty"
+            ),
+        )
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Exact-match eval example.")
+    parser.add_argument("name", nargs="?", default="eval-example-exact-match-py")
+    return parser.parse_args()
+def build_dataset() -> list[dict[str, str]]:
+    return [
+        {
+            "input": "What is the capital of France?",
+            "actual_output": "Paris",
+            "expected_output": "Paris",
+        },
+        {
+            "input": "What is the capital of Canada?",
+            "actual_output": "Ottawa is the capital of Canada.",
+            "expected_output": "Ottawa",
+        },
+        {
+            "input": "What is 2 + 2?",
+            "actual_output": "4",
+            "expected_output": "4",
+        },
+    ]
+def print_results(name: str, results: list[Any]) -> None:
+    run_id = results[0].run_id if results else "unknown"
+    print(f"eval_name={name}")
+    print(f"run_id={run_id}")
+    print()
+    for index, result in enumerate(results, start=1):
+        summary = " | ".join(
+            (
+                f"{scorer.name}:{'pass' if scorer.success else 'fail'}"
+                + (f" ({scorer.reason})" if scorer.reason else "")
+            )
+            for scorer in result.scorers_data
+        )
+        print(f"{index}. {'pass' if result.success else 'fail'} -> {summary}")
+def main() -> None:
+    args = parse_args()
+    configure_example()
+    client = EvaluationClient()
+    results = client.run(
+        name=args.name,
+        data=build_dataset(),
+        scorers=[ExactMatchScorer(), NonEmptyOutputScorer()],
+        metadata={
+            "model": "rule-based-exact-match",
+            "provider": "local",
+            "source": "python",
+            "example": "exact-match",
+        },
+    )
+    print_results(args.name, results)
+if __name__ == "__main__":
+    main()

freesolo 0.2.2__tar.gz → 0.2.3__tar.gz

freesolo 0.2.2.tar.gz → 0.2.3.tar.gz