freesolo 0.2.3__tar.gz → 0.2.4__tar.gz

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. freesolo-0.2.4/.github/workflows/publish-packages.yml +96 -0
  2. freesolo-0.2.4/.github/workflows/python-checks.yml +41 -0
  3. freesolo-0.2.4/.github/workflows/sync-package-function-usage.yml +38 -0
  4. {freesolo-0.2.3 → freesolo-0.2.4}/PKG-INFO +89 -141
  5. {freesolo-0.2.3 → freesolo-0.2.4}/README.md +77 -135
  6. freesolo-0.2.4/examples/PROMPT.md +10 -0
  7. freesolo-0.2.4/examples/README.md +96 -0
  8. freesolo-0.2.4/examples/TRAINING_CONTRACT.md +10 -0
  9. freesolo-0.2.4/examples/data/support_eval.jsonl +3 -0
  10. freesolo-0.2.4/examples/data/support_train.jsonl +3 -0
  11. freesolo-0.2.4/examples/environment.py +110 -0
  12. freesolo-0.2.4/examples/evaluation_custom_scorer.py +105 -0
  13. freesolo-0.2.4/examples/evaluation_from_files.py +47 -0
  14. freesolo-0.2.4/examples/gepa_prompt_example.py +76 -0
  15. freesolo-0.2.4/examples/support_dataset.py +20 -0
  16. freesolo-0.2.4/examples/tracing_manual_span.py +47 -0
  17. freesolo-0.2.4/examples/training_sft_grpo.py +75 -0
  18. freesolo-0.2.4/function_usage_registry.json +12 -0
  19. freesolo-0.2.4/pypi/freesolo/__init__.py +1 -0
  20. freesolo-0.2.4/pypi/freesolo/_usage.py +39 -0
  21. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/contracts/markdown.py +1 -5
  22. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/contracts/types.py +0 -1
  23. freesolo-0.2.4/pypi/freesolo/datasets/__init__.py +13 -0
  24. freesolo-0.2.4/pypi/freesolo/datasets/core.py +74 -0
  25. freesolo-0.2.4/pypi/freesolo/datasets/records.py +139 -0
  26. freesolo-0.2.4/pypi/freesolo/datasets/types.py +26 -0
  27. freesolo-0.2.4/pypi/freesolo/environments/__init__.py +17 -0
  28. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/environments/base.py +5 -78
  29. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/environments/evaluation.py +94 -57
  30. freesolo-0.2.4/pypi/freesolo/environments/types.py +79 -0
  31. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/__init__.py +0 -15
  32. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/client.py +20 -16
  33. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/__init__.py +0 -8
  34. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/base.py +1 -48
  35. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/groundedness.py +1 -1
  36. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/instruction_following.py +1 -1
  37. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/pairwise_preference.py +1 -1
  38. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/reference_correctness.py +1 -1
  39. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/judges/rubric.py +1 -1
  40. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/results.py +1 -1
  41. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/types.py +1 -1
  42. freesolo-0.2.4/pypi/freesolo/gepa/__init__.py +17 -0
  43. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/gepa/adapter.py +32 -29
  44. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/gepa/reflection.py +20 -5
  45. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/gepa/setup.py +23 -31
  46. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/gepa/types.py +8 -15
  47. freesolo-0.2.4/pypi/freesolo/tracing/__init__.py +13 -0
  48. freesolo-0.2.4/pypi/freesolo/tracing/otel.py +250 -0
  49. freesolo-0.2.4/pypi/freesolo/training/__init__.py +10 -0
  50. freesolo-0.2.4/pypi/freesolo/training/grpo/__init__.py +0 -0
  51. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/grpo/config.py +0 -6
  52. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/grpo/datums.py +0 -9
  53. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/grpo/rewards.py +0 -6
  54. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/grpo/sampling.py +0 -8
  55. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/storage.py +1 -4
  56. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/train_grpo.py +7 -11
  57. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/train_sft.py +33 -13
  58. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/training/types.py +12 -0
  59. freesolo-0.2.4/pypi/freesolo/utils/__init__.py +0 -0
  60. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/utils/checkpoints.py +0 -23
  61. freesolo-0.2.3/pypi/freesolo/util.py → freesolo-0.2.4/pypi/freesolo/utils/core.py +15 -222
  62. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/utils/openrouter.py +0 -37
  63. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/utils/oracle.py +3 -4
  64. {freesolo-0.2.3/pypi/freesolo → freesolo-0.2.4/pypi/freesolo/utils}/storage.py +78 -37
  65. freesolo-0.2.4/pypi/freesolo/utils/upload.py +60 -0
  66. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/utils/wandb.py +0 -41
  67. {freesolo-0.2.3 → freesolo-0.2.4}/pyproject.toml +12 -7
  68. {freesolo-0.2.3 → freesolo-0.2.4}/ruff.toml +3 -0
  69. freesolo-0.2.4/scripts/sync_package_function_usage.py +81 -0
  70. freesolo-0.2.4/tests/end_to_end_testing/test_environment_evaluation_flow.py +140 -0
  71. freesolo-0.2.4/tests/end_to_end_testing/test_examples.py +141 -0
  72. freesolo-0.2.4/tests/functionality/test_datasets.py +113 -0
  73. {freesolo-0.2.3/tests → freesolo-0.2.4/tests/functionality}/test_evaluation_client.py +5 -4
  74. {freesolo-0.2.3/tests → freesolo-0.2.4/tests/functionality}/test_gepa_adapter.py +54 -16
  75. freesolo-0.2.4/tests/functionality/test_records_rewards_and_config.py +126 -0
  76. {freesolo-0.2.3/tests → freesolo-0.2.4/tests/functionality}/test_storage_sync.py +143 -20
  77. freesolo-0.2.4/tests/functionality/test_tracing_opentelemetry.py +128 -0
  78. freesolo-0.2.4/tests/functionality/test_upload.py +97 -0
  79. freesolo-0.2.4/tests/functionality/test_usage_registry.py +63 -0
  80. freesolo-0.2.4/tests/security/test_sanitize_and_contract_security.py +60 -0
  81. freesolo-0.2.4/uv.lock +3328 -0
  82. freesolo-0.2.3/pypi/examples/.env.example +0 -10
  83. freesolo-0.2.3/pypi/examples/__init__.py +0 -1
  84. freesolo-0.2.3/pypi/examples/anthropic/__init__.py +0 -1
  85. freesolo-0.2.3/pypi/examples/anthropic/chat.py +0 -56
  86. freesolo-0.2.3/pypi/examples/anthropic/vision.py +0 -82
  87. freesolo-0.2.3/pypi/examples/evals/__init__.py +0 -1
  88. freesolo-0.2.3/pypi/examples/evals/exact_match.py +0 -105
  89. freesolo-0.2.3/pypi/examples/evals/llm_judge.py +0 -149
  90. freesolo-0.2.3/pypi/examples/gemini/__init__.py +0 -1
  91. freesolo-0.2.3/pypi/examples/gemini/chat.py +0 -53
  92. freesolo-0.2.3/pypi/examples/gemini/vision.py +0 -79
  93. freesolo-0.2.3/pypi/examples/openai/__init__.py +0 -1
  94. freesolo-0.2.3/pypi/examples/openai/chat.py +0 -56
  95. freesolo-0.2.3/pypi/examples/openai/vision.py +0 -64
  96. freesolo-0.2.3/pypi/examples/openrouter/__init__.py +0 -1
  97. freesolo-0.2.3/pypi/examples/openrouter/chat.py +0 -60
  98. freesolo-0.2.3/pypi/examples/utils.py +0 -231
  99. freesolo-0.2.3/pypi/freesolo/__init__.py +0 -61
  100. freesolo-0.2.3/pypi/freesolo/environments/__init__.py +0 -32
  101. freesolo-0.2.3/pypi/freesolo/environments/types.py +0 -157
  102. freesolo-0.2.3/pypi/freesolo/gepa/__init__.py +0 -47
  103. freesolo-0.2.3/pypi/freesolo/tracing/__init__.py +0 -31
  104. freesolo-0.2.3/pypi/freesolo/tracing/client.py +0 -548
  105. freesolo-0.2.3/pypi/freesolo/tracing/decorators.py +0 -66
  106. freesolo-0.2.3/pypi/freesolo/tracing/providers/__init__.py +0 -14
  107. freesolo-0.2.3/pypi/freesolo/tracing/providers/anthropic.py +0 -111
  108. freesolo-0.2.3/pypi/freesolo/tracing/providers/config.py +0 -101
  109. freesolo-0.2.3/pypi/freesolo/tracing/providers/gemini.py +0 -205
  110. freesolo-0.2.3/pypi/freesolo/tracing/providers/openai.py +0 -208
  111. freesolo-0.2.3/pypi/freesolo/tracing/providers/utils.py +0 -276
  112. freesolo-0.2.3/pypi/freesolo/tracing/types.py +0 -79
  113. freesolo-0.2.3/pypi/freesolo/training/__init__.py +0 -14
  114. freesolo-0.2.3/pypi/freesolo/training/grpo/__init__.py +0 -20
  115. freesolo-0.2.3/pypi/freesolo/utils/__init__.py +0 -119
  116. freesolo-0.2.3/pypi/freesolo/utils/deployment.py +0 -70
  117. freesolo-0.2.3/uv.lock +0 -1204
  118. {freesolo-0.2.3 → freesolo-0.2.4}/.env.example +0 -0
  119. {freesolo-0.2.3 → freesolo-0.2.4}/.gitignore +0 -0
  120. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/.gitignore +0 -0
  121. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/contracts/__init__.py +0 -0
  122. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/evaluation/responses.py +0 -0
  123. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/py.typed +0 -0
  124. {freesolo-0.2.3 → freesolo-0.2.4}/pypi/freesolo/tracing/sanitize.py +0 -0
  125. {freesolo-0.2.3/tests → freesolo-0.2.4/tests/functionality}/test_utils_checkpoints.py +0 -0
@@ -0,0 +1,96 @@
1
+ name: Publish packages
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "pyproject.toml"
9
+ - "uv.lock"
10
+ - "pypi/**"
11
+ - "examples/**"
12
+ - ".github/workflows/publish-packages.yml"
13
+ workflow_dispatch:
14
+
15
+ concurrency:
16
+ group: publish-packages-${{ github.ref }}
17
+ cancel-in-progress: false
18
+
19
+ jobs:
20
+ publish-pypi:
21
+ name: Publish PyPI package
22
+ runs-on: ubuntu-latest
23
+ permissions:
24
+ contents: read
25
+ env:
26
+ UV_PUBLISH_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
27
+
28
+ steps:
29
+ - uses: actions/checkout@v6
30
+
31
+ - uses: actions/setup-python@v6
32
+ with:
33
+ python-version: "3.12"
34
+
35
+ - name: Read package metadata
36
+ id: metadata
37
+ run: |
38
+ python - <<'PY' >> "$GITHUB_OUTPUT"
39
+ import tomllib
40
+
41
+ with open("pyproject.toml", "rb") as f:
42
+ project = tomllib.load(f)["project"]
43
+
44
+ print(f"name={project['name']}")
45
+ print(f"version={project['version']}")
46
+ PY
47
+
48
+ - name: Check PyPI for existing version
49
+ id: pypi
50
+ env:
51
+ PACKAGE_NAME: ${{ steps.metadata.outputs.name }}
52
+ PACKAGE_VERSION: ${{ steps.metadata.outputs.version }}
53
+ run: |
54
+ python - <<'PY' >> "$GITHUB_OUTPUT"
55
+ import os
56
+ import urllib.error
57
+ import urllib.request
58
+
59
+ name = os.environ["PACKAGE_NAME"]
60
+ version = os.environ["PACKAGE_VERSION"]
61
+ url = f"https://pypi.org/pypi/{name}/{version}/json"
62
+
63
+ try:
64
+ with urllib.request.urlopen(url, timeout=30) as response:
65
+ exists = response.status == 200
66
+ except urllib.error.HTTPError as error:
67
+ if error.code != 404:
68
+ raise
69
+ exists = False
70
+
71
+ print(f"exists={'true' if exists else 'false'}")
72
+ PY
73
+
74
+ - name: Skip existing PyPI version
75
+ if: steps.pypi.outputs.exists == 'true'
76
+ run: echo "${{ steps.metadata.outputs.name }} ${{ steps.metadata.outputs.version }} is already on PyPI."
77
+
78
+ - name: Install uv
79
+ if: steps.pypi.outputs.exists != 'true'
80
+ run: python -m pip install --upgrade uv
81
+
82
+ - name: Build distributions
83
+ if: steps.pypi.outputs.exists != 'true'
84
+ run: |
85
+ rm -rf dist
86
+ uv build
87
+
88
+ - name: Publish to PyPI
89
+ if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN != ''
90
+ run: uv publish
91
+
92
+ - name: Skip publish without PyPI token
93
+ if: steps.pypi.outputs.exists != 'true' && env.UV_PUBLISH_TOKEN == ''
94
+ run: |
95
+ echo "PYPI_API_TOKEN is not configured; built distributions but skipped upload."
96
+ echo "Add a PYPI_API_TOKEN repository secret to publish this package."
@@ -0,0 +1,41 @@
1
+ name: Python checks
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+ checks:
15
+ name: Ruff and tests
16
+ runs-on: ubuntu-latest
17
+
18
+ steps:
19
+ - uses: actions/checkout@v6
20
+
21
+ - uses: actions/setup-python@v6
22
+ with:
23
+ python-version: "3.12"
24
+
25
+ - name: Install uv
26
+ run: python3 -m pip install --upgrade uv
27
+
28
+ - name: Install dependencies
29
+ run: uv sync --locked --extra dev
30
+
31
+ - name: Python compile check
32
+ run: python3 -m py_compile $(find pypi tests -name '*.py' -print)
33
+
34
+ - name: Ruff check
35
+ run: uv run --extra dev python -m ruff check .
36
+
37
+ - name: Ruff format check
38
+ run: uv run --extra dev python -m ruff format --check .
39
+
40
+ - name: Tests
41
+ run: uv run --extra dev python -m pytest tests
@@ -0,0 +1,38 @@
1
+ name: Sync package function usage
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - "function_usage_registry.json"
9
+ - "scripts/sync_package_function_usage.py"
10
+ - ".github/workflows/sync-package-function-usage.yml"
11
+ workflow_dispatch:
12
+
13
+ permissions:
14
+ contents: read
15
+
16
+ jobs:
17
+ sync:
18
+ name: Sync usage registry
19
+ runs-on: ubuntu-latest
20
+ if: ${{ github.ref == 'refs/heads/main' }}
21
+ env:
22
+ SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
23
+ SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
24
+
25
+ steps:
26
+ - uses: actions/checkout@v6
27
+
28
+ - uses: actions/setup-python@v6
29
+ with:
30
+ python-version: "3.12"
31
+
32
+ - name: Sync package function rows
33
+ if: env.SUPABASE_URL != '' && env.SUPABASE_SERVICE_ROLE_KEY != ''
34
+ run: python scripts/sync_package_function_usage.py --remove-stale
35
+
36
+ - name: Skip without Supabase secrets
37
+ if: env.SUPABASE_URL == '' || env.SUPABASE_SERVICE_ROLE_KEY == ''
38
+ run: echo "SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY secrets are required to sync usage."
@@ -1,19 +1,25 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: freesolo
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Tracing, evaluation, and training utilities for LLM applications.
5
- Requires-Python: >=3.10
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: gepa>=0.1.1
6
7
  Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: jsonschema>=4.0.0
9
+ Requires-Dist: numpy>=1.26.0
10
+ Requires-Dist: opentelemetry-api>=1.28.0
11
+ Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.28.0
12
+ Requires-Dist: opentelemetry-sdk>=1.28.0
13
+ Requires-Dist: pymongo>=4.0.0
14
+ Requires-Dist: python-dotenv>=1.0.0
15
+ Requires-Dist: tinker-cookbook>=0.3.0
16
+ Requires-Dist: tinker>=0.19.0
7
17
  Requires-Dist: wandb>=0.17.0
8
18
  Provides-Extra: dev
9
19
  Requires-Dist: pytest>=8.0.0; extra == 'dev'
10
20
  Requires-Dist: ruff>=0.11.0; extra == 'dev'
11
21
  Provides-Extra: examples
12
- Requires-Dist: anthropic>=0.40.0; extra == 'examples'
13
- Requires-Dist: google-genai>=1.0.0; extra == 'examples'
14
22
  Requires-Dist: openai>=1.0.0; extra == 'examples'
15
- Provides-Extra: gepa
16
- Requires-Dist: gepa>=0.1.1; extra == 'gepa'
17
23
  Description-Content-Type: text/markdown
18
24
 
19
25
  # freesolo
@@ -24,36 +30,15 @@ It is built for the lowest-friction integration possible:
24
30
 
25
31
  1. Install the package
26
32
  2. Set `FREESOLO_API_KEY`
27
- 3. Wrap your OpenAI, Anthropic, Gemini, or OpenAI-compatible client
33
+ 3. Configure the tracer
28
34
  4. Run traces and evaluations from the package APIs
29
35
 
30
- ## Current provider support
31
-
32
- `freesolo` currently supports automatic client instrumentation for:
33
-
34
- - OpenAI
35
- - Anthropic
36
- - Gemini
37
- - OpenAI-compatible clients via `wrap(...)` / `wrap_provider(...)`
38
-
39
36
  ## Install
40
37
 
41
- Install the package plus the provider client you use:
42
-
43
- ```bash
44
- pip install freesolo openai
45
- ```
46
-
47
- or
48
-
49
- ```bash
50
- pip install freesolo anthropic
51
- ```
52
-
53
- or
38
+ Install the package:
54
39
 
55
40
  ```bash
56
- pip install freesolo google-genai
41
+ pip install freesolo
57
42
  ```
58
43
 
59
44
  ## Environment
@@ -68,107 +53,77 @@ export FREESOLO_API_KEY=fslo_...
68
53
  ## Quickstart
69
54
 
70
55
  ```python
71
- from openai import OpenAI
72
- from freesolo import wrap
73
-
74
- client = wrap(OpenAI())
75
-
76
- result = client.responses.create(
77
- model="gpt-4.1-mini",
78
- instructions="Reply in plain text.",
79
- input=[
80
- {
81
- "role": "user",
82
- "content": [{"type": "input_text", "text": "How do I reset my password?"}],
83
- }
84
- ],
85
- )
86
-
87
- print(result.output_text or "")
56
+ from freesolo.tracing import configure_tracer, get_tracer
57
+
58
+ configure_tracer(service_name="my-llm-app")
59
+ tracer = get_tracer()
60
+
61
+ with tracer.start_as_current_span(
62
+ "model.call",
63
+ attributes={
64
+ "gen_ai.system": "openai",
65
+ "gen_ai.request.model": "gpt-5.5",
66
+ "freesolo.input": {"prompt": "How do I reset my password?"},
67
+ },
68
+ ) as span:
69
+ result = "Reset it from account settings."
70
+ span.set_attribute("freesolo.output", result)
88
71
  ```
89
72
 
90
- ## OpenRouter Quickstart
73
+ ## Runnable Examples
91
74
 
92
- ```python
93
- from openai import OpenAI
94
- from freesolo import wrap
75
+ Copy-pasteable examples live in [`examples/`](examples/):
95
76
 
96
- client = wrap(
97
- OpenAI(
98
- base_url="https://openrouter.ai/api/v1",
99
- api_key="YOUR_OPENROUTER_API_KEY",
100
- )
101
- )
77
+ - `tracing_manual_span.py`: configure OpenTelemetry and send one application span.
78
+ - `evaluation_custom_scorer.py`: run custom binary and numeric eval scorers.
79
+ - `evaluation_from_files.py`: run evals from a concrete dataset and environment.
80
+ - `environment.py`: example environment used by evals, training, and GEPA.
81
+ - `support_dataset.py`: example dataset paths and loaders used by evals, SFT, GRPO, and GEPA.
82
+ - `gepa_prompt_example.py`: run the Freesolo GEPA adapter over the example dataset.
83
+ - `training_sft_grpo.py`: start SFT or GRPO training runs from package APIs.
102
84
 
103
- response = client.chat.completions.create(
104
- model="openai/gpt-4.1-mini",
105
- messages=[
106
- {"role": "system", "content": "Reply in plain text."},
107
- {"role": "user", "content": "Write a one-sentence launch blurb."},
108
- ],
109
- max_tokens=120,
110
- )
111
-
112
- print(response.choices[0].message.content or "")
113
- ```
114
-
115
- ## Gemini Quickstart
116
-
117
- ```python
118
- from google import genai
119
- from freesolo import instrument_gemini
85
+ From a repo checkout:
120
86
 
121
- client = instrument_gemini(genai.Client())
122
-
123
- response = client.models.generate_content(
124
- model="gemini-2.5-flash",
125
- contents="Write a one-sentence release note for traced Gemini support.",
126
- )
127
-
128
- print(response.text or "")
87
+ ```bash
88
+ cd freesolo-sdk
89
+ export PYTHONPATH="$PWD/pypi"
90
+ uv run python examples/evaluation_custom_scorer.py --local
129
91
  ```
130
92
 
131
- ## Group Multiple Model Calls
132
-
133
- For agentic or long-horizon tasks, strongly prefer wrapping the whole task in `start_trace(...)` so all of the model calls land in one trace.
134
-
135
- For a single one-off OpenAI, Anthropic, or Gemini request, you can skip it.
136
-
137
- ```python
138
- from anthropic import Anthropic
139
- from freesolo import instrument_anthropic, start_trace
140
-
141
- client = instrument_anthropic(Anthropic())
142
-
143
- with start_trace("support-agent-run"):
144
- first = client.messages.create(
145
- model="claude-sonnet-4-20250514",
146
- max_tokens=64,
147
- messages=[{"role": "user", "content": "Say hello"}],
148
- )
149
- second = client.messages.create(
150
- model="claude-sonnet-4-20250514",
151
- max_tokens=64,
152
- messages=[{"role": "user", "content": "Say goodbye"}],
153
- )
154
- ```
93
+ ## Public API
94
+
95
+ The root `freesolo` module intentionally exports no functions. Import from the
96
+ subpackages below; lower-level modules may be importable, but they are
97
+ implementation helpers unless they appear here or in an example.
98
+
99
+ | Import | Use case |
100
+ | --- | --- |
101
+ | `freesolo.tracing.configure_tracer`, `get_tracer`, `force_flush`, `shutdown` | Send OpenTelemetry traces from an application to Freesolo. |
102
+ | `freesolo.evaluation.EvaluationClient` | Run custom-scorer evals or environment evals and upload results to Freesolo. |
103
+ | `freesolo.evaluation.run_local_evaluation` | Run custom scorers locally without uploading results. |
104
+ | `freesolo.evaluation.CustomScorer`, `BinaryResponse`, `NumericResponse` | Define local scorer logic for eval rows. |
105
+ | `freesolo.evaluation.HostedJudgeClient` and hosted scorer classes | Use hosted LLM-as-judge scorers with OpenRouter-compatible credentials. |
106
+ | `freesolo.datasets.TaskExample`, `Dataset`, `load_dataset` | Load task examples and construct labeled conversations for evals or training. |
107
+ | `freesolo.environments.Environment`, `RewardResult`, `RewardMetric`, `GrpoConfig`, `EnvironmentGeneration` | Define task behavior once for evals, GEPA, SFT, and GRPO. |
108
+ | `freesolo.training.SftConfig`, `TrainGrpoOptions`, `train_sft`, `train_grpo` | Start SFT or GRPO training from package APIs. |
109
+ | `freesolo.gepa.GEPASetup`, `GEPAConfig`, `DefaultReflectionAgent`, `attach_gepa`, `optimize_gepa` | Optimize prompts through the GEPA adapter using the same environment and dataset abstractions. |
110
+ | `freesolo.contracts.load_contract_text`, `extract_contract_spec`, `load_contract_spec`, `build_oracle_messages` | Read contract markdown and build oracle prompt messages. |
111
+ | `freesolo.utils.oracle.generate_ground_truth_records` | Generate ground-truth JSONL records from source examples using a contract, environment, and oracle model. |
112
+ | `freesolo.utils.upload.upload_tinker_checkpoint_to_huggingface` | Upload a Tinker checkpoint to a private Hugging Face model repo. |
155
113
 
156
114
  ## What Gets Stored
157
115
 
158
- - Trace title if you explicitly pass it to `start_trace("...")`
159
- - Trace metadata if you explicitly pass it to `start_trace(..., metadata=...)`
160
- - Input payloads with `system_prompt`, `user_prompt`, and `images`
161
- - Output payloads as plain text
162
- - Token usage when available
163
- - Image inputs with inline previews for the trace UI
116
+ - Native OTLP traces and spans
117
+ - Resource attributes like `service.name`
118
+ - Span names, timings, parent span ids, status, and errors
119
+ - Common model attributes such as `gen_ai.system`, `gen_ai.request.model`, and token counts
120
+ - Optional `freesolo.input` and `freesolo.output` span attributes
164
121
 
165
122
  ## Notes
166
123
 
167
- - You do not need `@trace()` for ordinary LLM tracing.
168
- - A single instrumented OpenAI, Anthropic, or Gemini request creates a trace automatically.
169
- - For OpenAI-compatible providers like OpenRouter, prefer `wrap(...)` instead of provider-specific helpers.
170
- - For agentic or long-horizon workflows, strongly recommend `start_trace("descriptive-title")` so planning, retries, and follow-up calls stay grouped.
171
- - Delivery is best-effort by default. Trace ingestion failures do not break your app.
124
+ - Tracing uses native OpenTelemetry protobuf export to `/api/traces/ingest`.
125
+ - Configure third-party OpenTelemetry instrumentors against the provider returned by `configure_tracer(...)`.
126
+ - Delivery is handled by the OpenTelemetry span processor you configure.
172
127
 
173
128
  ## Evaluations
174
129
 
@@ -216,16 +171,15 @@ results = client.run(
216
171
  print(results[0].success)
217
172
  ```
218
173
 
219
- ## Tinker Deployment
174
+ ## Tinker Hugging Face Upload
220
175
 
221
- `freesolo.utils.deployment` is a thin proxy for the Modal deployment server. It posts
222
- a Tinker checkpoint URL to the pinned Modal `/deployments` endpoint and returns
223
- the server JSON response.
176
+ `freesolo.utils.upload` posts a Tinker checkpoint URL to the Freesolo upload
177
+ service and returns the Hugging Face upload response.
224
178
 
225
179
  ```python
226
- from freesolo.utils.deployment import deploy_tinker_checkpoint
180
+ from freesolo.utils.upload import upload_tinker_checkpoint_to_huggingface
227
181
 
228
- result = deploy_tinker_checkpoint(
182
+ result = upload_tinker_checkpoint_to_huggingface(
229
183
  "tinker://<run_id>/sampler_weights/final",
230
184
  base_model="Qwen/Qwen3.5-35B-A3B",
231
185
  )
@@ -235,34 +189,36 @@ print(result["repoId"])
235
189
 
236
190
  ### Environment-driven evaluations
237
191
 
238
- For training contracts, you can use the same `Environment` adapter for evals,
239
- SFT, and GRPO. `run_environment` loads examples, builds prompt messages, calls
240
- your model callback, scores the response through the environment, and uploads
241
- the same `scorers_data` shape used by the eval DB.
192
+ For training contracts, `Environment` describes task behavior for evals and
193
+ GRPO/RL: prompt construction, response normalization, and reward scoring.
194
+ Dataset loading and labeled conversation construction live in `freesolo.datasets`.
195
+ `run_environment` loads task examples, calls your model callback, scores the
196
+ response through the environment, and uploads the same `scorers_data` shape used
197
+ by the eval DB.
242
198
 
243
199
  ```python
244
200
  from typing import Any
245
201
 
246
202
  from openai import OpenAI
247
203
 
204
+ from freesolo.datasets import TaskExample
248
205
  from freesolo.environments import (
249
206
  Environment,
250
207
  EnvironmentGeneration,
251
208
  RewardMetric,
252
209
  RewardResult,
253
- TaskExample,
254
210
  )
255
211
  from freesolo.evaluation import EvaluationClient
256
212
 
257
213
 
258
- class ContractEnvironment(Environment):
214
+ class PromptEnvironment(Environment):
259
215
  def build_prompt_messages(
260
216
  self,
261
217
  example: TaskExample,
262
- contract_text: str,
218
+ prompt_text: str,
263
219
  ):
264
220
  return [
265
- {"role": "system", "content": contract_text},
221
+ {"role": "system", "content": prompt_text},
266
222
  {"role": "user", "content": example.task},
267
223
  ]
268
224
 
@@ -359,7 +315,6 @@ from typing import Any
359
315
 
360
316
  from openai import OpenAI
361
317
 
362
- from freesolo import instrument_openai
363
318
  from freesolo.evaluation import CustomScorer, EvaluationClient, NumericResponse
364
319
 
365
320
 
@@ -403,7 +358,7 @@ class CorrectnessJudge(CustomScorer[NumericResponse]):
403
358
  )
404
359
 
405
360
 
406
- judge_client = instrument_openai(OpenAI())
361
+ judge_client = OpenAI()
407
362
 
408
363
  results = EvaluationClient().run(
409
364
  name="support-agent-correctness",
@@ -434,11 +389,4 @@ judge = HostedJudgeClient(api_key="YOUR_OPENROUTER_API_KEY")
434
389
  scorer = ReferenceCorrectnessScorer(client=judge)
435
390
  ```
436
391
 
437
- Tracing is available through namespaced helpers:
438
-
439
- ```python
440
- from freesolo.tracing import start_trace
441
-
442
- with start_trace("support-agent-run"):
443
- ...
444
- ```
392
+ Tracing is available through the OpenTelemetry helpers in `freesolo.tracing`.