athanor-sdk 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- athanor_sdk-0.3.2/.coverage +0 -0
- athanor_sdk-0.3.2/.github/dependabot.yml +23 -0
- athanor_sdk-0.3.2/.github/workflows/ci.yml +91 -0
- athanor_sdk-0.3.2/.gitignore +12 -0
- athanor_sdk-0.3.2/PKG-INFO +196 -0
- athanor_sdk-0.3.2/README.md +176 -0
- athanor_sdk-0.3.2/docs/task-builder.md +320 -0
- athanor_sdk-0.3.2/pyproject.toml +40 -0
- athanor_sdk-0.3.2/src/athanor/__init__.py +28 -0
- athanor_sdk-0.3.2/src/athanor/calibrate.py +419 -0
- athanor_sdk-0.3.2/src/athanor/cli.py +1050 -0
- athanor_sdk-0.3.2/src/athanor/compare.py +89 -0
- athanor_sdk-0.3.2/src/athanor/env.py +376 -0
- athanor_sdk-0.3.2/src/athanor/estimate.py +97 -0
- athanor_sdk-0.3.2/src/athanor/eval_status.py +183 -0
- athanor_sdk-0.3.2/src/athanor/lean.py +262 -0
- athanor_sdk-0.3.2/src/athanor/lint.py +80 -0
- athanor_sdk-0.3.2/src/athanor/preflight.py +183 -0
- athanor_sdk-0.3.2/src/athanor/property.py +139 -0
- athanor_sdk-0.3.2/src/athanor/providers.py +194 -0
- athanor_sdk-0.3.2/src/athanor/runner.py +1225 -0
- athanor_sdk-0.3.2/src/athanor/scoring.py +102 -0
- athanor_sdk-0.3.2/src/athanor/stats.py +95 -0
- athanor_sdk-0.3.2/src/athanor/types.py +40 -0
- athanor_sdk-0.3.2/tests/__init__.py +0 -0
- athanor_sdk-0.3.2/tests/test_calibrate.py +200 -0
- athanor_sdk-0.3.2/tests/test_calibrate_modes.py +2226 -0
- athanor_sdk-0.3.2/tests/test_cli.py +376 -0
- athanor_sdk-0.3.2/tests/test_compare.py +162 -0
- athanor_sdk-0.3.2/tests/test_env.py +106 -0
- athanor_sdk-0.3.2/tests/test_eval_status.py +222 -0
- athanor_sdk-0.3.2/tests/test_integration.py +568 -0
- athanor_sdk-0.3.2/tests/test_preflight_lint_estimate.py +362 -0
- athanor_sdk-0.3.2/tests/test_property.py +242 -0
- athanor_sdk-0.3.2/tests/test_providers.py +178 -0
- athanor_sdk-0.3.2/tests/test_retry_context.py +87 -0
- athanor_sdk-0.3.2/tests/test_runner.py +639 -0
- athanor_sdk-0.3.2/tests/test_stats.py +215 -0
- athanor_sdk-0.3.2/tests/test_types.py +75 -0
|
Binary file
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
updates:
|
|
3
|
+
- package-ecosystem: "pip"
|
|
4
|
+
directory: "/"
|
|
5
|
+
schedule:
|
|
6
|
+
interval: "weekly"
|
|
7
|
+
day: "monday"
|
|
8
|
+
open-pull-requests-limit: 5
|
|
9
|
+
labels:
|
|
10
|
+
- "dependencies"
|
|
11
|
+
- "security"
|
|
12
|
+
commit-message:
|
|
13
|
+
prefix: "deps"
|
|
14
|
+
include: "scope"
|
|
15
|
+
# We ship to PyPI so transitive deps land on customers. Be paranoid:
|
|
16
|
+
# group patch-level updates so the maintenance noise stays manageable
|
|
17
|
+
# but security-critical updates surface immediately.
|
|
18
|
+
groups:
|
|
19
|
+
patch-updates:
|
|
20
|
+
patterns:
|
|
21
|
+
- "*"
|
|
22
|
+
update-types:
|
|
23
|
+
- "patch"
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
    tags: ["v*"]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.12"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: ${{ matrix.python-version }}
|
|
20
|
+
|
|
21
|
+
- name: Install package with dev dependencies
|
|
22
|
+
run: pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
- name: Lint (syntax check)
|
|
25
|
+
run: |
|
|
26
|
+
# Recursive — covers subpackages, not just src/athanor/*.py top level.
|
|
27
|
+
find src -name "*.py" -exec python -m py_compile {} +
|
|
28
|
+
|
|
29
|
+
- name: Run tests with coverage
|
|
30
|
+
# Threshold is 45%: current coverage is ~50%, leaving a 5-point
|
|
31
|
+
# buffer for legitimate test reorgs while still catching significant
|
|
32
|
+
# regressions. Raise as we add tests — 70% is the eventual goal
|
|
33
|
+
# for customer-facing code, but we don't want to gate the entire
|
|
34
|
+
# CI on aspirational targets today.
|
|
35
|
+
run: |
|
|
36
|
+
pip install pytest-cov -q
|
|
37
|
+
pytest tests/ -v --cov=src/athanor --cov-report=term-missing --cov-fail-under=45
|
|
38
|
+
|
|
39
|
+
security:
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
# Doesn't gate the test job — security findings are advisory.
|
|
42
|
+
# We ship to PyPI; supply chain risk is real but advisories alone
|
|
43
|
+
# shouldn't block releases. Same threat-model approach as
|
|
44
|
+
# athanor-builder/SECURITY.md.
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v4
|
|
47
|
+
|
|
48
|
+
- uses: actions/setup-python@v5
|
|
49
|
+
with:
|
|
50
|
+
python-version: "3.12"
|
|
51
|
+
|
|
52
|
+
- name: Install pip-audit
|
|
53
|
+
run: pip install pip-audit -q
|
|
54
|
+
|
|
55
|
+
- name: Audit declared dependencies
|
|
56
|
+
# Surface findings as warnings, not hard fail. New critical CVEs
|
|
57
|
+
# in our dep tree should be flagged but not block an unrelated PR.
|
|
58
|
+
run: |
|
|
59
|
+
pip-audit --desc 2>&1 | tee /tmp/audit.log || true
|
|
60
|
+
if grep -qi "vulnerability\|GHSA-\|CVE-" /tmp/audit.log 2>/dev/null; then
|
|
61
|
+
echo "::warning::pip-audit found vulnerabilities — see logs above"
|
|
62
|
+
fi
|
|
63
|
+
continue-on-error: true
|
|
64
|
+
|
|
65
|
+
publish:
|
|
66
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
67
|
+
needs: test
|
|
68
|
+
runs-on: ubuntu-latest
|
|
69
|
+
permissions:
|
|
70
|
+
id-token: write
|
|
71
|
+
steps:
|
|
72
|
+
- uses: actions/checkout@v4
|
|
73
|
+
|
|
74
|
+
- uses: actions/setup-python@v5
|
|
75
|
+
with:
|
|
76
|
+
python-version: "3.12"
|
|
77
|
+
|
|
78
|
+
- name: Install build tools
|
|
79
|
+
run: pip install build twine
|
|
80
|
+
|
|
81
|
+
- name: Build package
|
|
82
|
+
run: python -m build
|
|
83
|
+
|
|
84
|
+
- name: Check package
|
|
85
|
+
run: twine check dist/*
|
|
86
|
+
|
|
87
|
+
- name: Publish to PyPI
|
|
88
|
+
env:
|
|
89
|
+
TWINE_USERNAME: __token__
|
|
90
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
91
|
+
run: twine upload dist/*
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: athanor-sdk
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Formal verification as agentic training signal — CLI + self-hosted runner
|
|
5
|
+
Project-URL: Homepage, https://athanor-ai.com
|
|
6
|
+
Project-URL: Repository, https://github.com/athanor-ai/athanor-sdk
|
|
7
|
+
Author-email: Athanor AI <aidan@athanor-ai.com>
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
17
|
+
Provides-Extra: multi-model
|
|
18
|
+
Requires-Dist: litellm>=1.50.0; extra == 'multi-model'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://raw.githubusercontent.com/athanor-ai/athanor-website/main/logo.svg" width="64" height="64" alt="Athanor">
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
<h1 align="center">athanor-ai</h1>
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<strong>Lean 4 proof verification as agentic training signal.</strong><br>
|
|
29
|
+
Turn formal proofs into reward functions. Score agent output with compilers, not judges.<br><br>
|
|
30
|
+
<a href="https://athanor-ai.com">athanor-ai.com</a>
|
|
31
|
+
</p>
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
Your agent writes code. Then it writes a proof that the code is correct. The Lean 4 compiler checks the proof. The result is a training signal with no ambiguity.
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
import athanor
|
|
39
|
+
|
|
40
|
+
# Verify a Lean 4 proof
|
|
41
|
+
result = athanor.verify_proof("""
|
|
42
|
+
theorem add_comm (a b : Nat) : a + b = b + a := by
|
|
43
|
+
omega
|
|
44
|
+
""")
|
|
45
|
+
|
|
46
|
+
print(result.compiles) # True
|
|
47
|
+
print(result.has_sorry) # False
|
|
48
|
+
print(result.score) # 1.0
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Install
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install athanor-sdk
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## What this solves
|
|
58
|
+
|
|
59
|
+
You have domain expertise. You know what correct code looks like. You want an AI agent to produce verified solutions, not guesses.
|
|
60
|
+
|
|
61
|
+
The problem: LLM judges are noisy. Unit tests are brittle. Benchmarks don't produce training signal.
|
|
62
|
+
|
|
63
|
+
The solution: Lean 4 formal proofs are deterministic, machine-checked, and produce continuous reward signal (full proof = 1.0, partial = 0.35, broken = 0.25).
|
|
64
|
+
|
|
65
|
+
## Verify proofs
|
|
66
|
+
|
|
67
|
+
Check if a Lean 4 proof compiles. Detect `sorry` placeholders. Catch banned constructs (`axiom`, `import Mathlib`, `unsafe`).
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from athanor import verify_proof, check_sorry, score_proof
|
|
71
|
+
|
|
72
|
+
# Full verification with detailed result
|
|
73
|
+
result = verify_proof(proof_code)
|
|
74
|
+
result.compiles # did it compile?
|
|
75
|
+
result.has_sorry # any incomplete proof markers?
|
|
76
|
+
result.sorry_count # how many sorry placeholders?
|
|
77
|
+
result.score # 0.0 - 1.0
|
|
78
|
+
result.status # "full_proof" | "partial_proof" | "compile_error" | "banned"
|
|
79
|
+
result.errors # compiler error messages
|
|
80
|
+
|
|
81
|
+
# Quick score (just the float)
|
|
82
|
+
score = score_proof(proof_code) # 1.0, 0.35, 0.25, or 0.0
|
|
83
|
+
|
|
84
|
+
# Check for sorry without full compilation
|
|
85
|
+
has_sorry, count = check_sorry(proof_code)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Works with local Lean 4 installation or via Docker (`ghcr.io/leanprover/lean4`).
|
|
89
|
+
|
|
90
|
+
## Score agent output
|
|
91
|
+
|
|
92
|
+
Pair code with a proof. Score both. Use the result as reward.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import athanor
|
|
96
|
+
|
|
97
|
+
env = athanor.make("my-environment", task="my-task")
|
|
98
|
+
env.reset()
|
|
99
|
+
|
|
100
|
+
result = env.score({
|
|
101
|
+
"kernel.py": agent_code,
|
|
102
|
+
"proof.lean": agent_proof,
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
# Scoring layers:
|
|
106
|
+
# 1. Does the code work? (verifier checks)
|
|
107
|
+
# 2. Does the proof compile? (Lean compiler)
|
|
108
|
+
# 3. Is the proof complete? (no sorry)
|
|
109
|
+
print(result.score) # combined score
|
|
110
|
+
print(result.lean_status) # proof status
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Agent retry with verifier feedback
|
|
114
|
+
|
|
115
|
+
Agent gets the scoring output and tries again. No human in the loop. The verifier feedback is the teacher.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
results = env.run(
|
|
119
|
+
model="anthropic/claude-sonnet-4-6",
|
|
120
|
+
api_key="...",
|
|
121
|
+
max_retries=3,
|
|
122
|
+
target_score=0.95,
|
|
123
|
+
)
|
|
124
|
+
# Attempt 1: 0.35 (code correct, proof has sorry)
|
|
125
|
+
# Attempt 2: 0.72 (proof compiles, 2 sorry remaining)
|
|
126
|
+
# Attempt 3: 0.98 (full proof, verified)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## RL training
|
|
130
|
+
|
|
131
|
+
Use proof scores as reward signal in any RL framework.
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from trl import PPOTrainer
|
|
135
|
+
|
|
136
|
+
env = athanor.make("my-environment")
|
|
137
|
+
trainer = PPOTrainer(
|
|
138
|
+
reward_fn=lambda completions: env.reward_fn(completions),
|
|
139
|
+
...
|
|
140
|
+
)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Compatible with TRL, veRL, NeMo-RL, or any custom training loop.
|
|
144
|
+
|
|
145
|
+
## Proof scoring
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
proof_multiplier:
|
|
149
|
+
1.00 full proof (compiles, no sorry)
|
|
150
|
+
0.35 partial proof (compiles with sorry)
|
|
151
|
+
0.25 broken proof (does not compile)
|
|
152
|
+
0.15 no proof submitted
|
|
153
|
+
0.00 banned construct (axiom, Mathlib, unsafe)
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Partial proofs produce gradient. An agent that proves 4 of 7 theorems scores higher than one that proves 0. This is the training signal.
|
|
157
|
+
|
|
158
|
+
## Getting environments
|
|
159
|
+
|
|
160
|
+
The `verify_proof` and `score_proof` functions work standalone with any Lean 4 code. For full environment scoring (code + proof + property tests), contact [athanor-ai.com](https://athanor-ai.com).
|
|
161
|
+
|
|
162
|
+
## CLI
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
# Container sanity checks — 7 gates including shim re-export + reward-leak probe
|
|
166
|
+
athanor preflight --env <env_slug>
|
|
167
|
+
|
|
168
|
+
# Batch evaluate all tasks in an environment (or filter/patch)
|
|
169
|
+
athanor evaluate --env hw-cbmc --model anthropic/claude-sonnet-4-6
|
|
170
|
+
athanor evaluate --env hw-cbmc --tasks fix_arb_lock,fix_tlb_ctrl # subset
|
|
171
|
+
athanor evaluate --env hw-cbmc --patch runs/prev_run1.json # resume
|
|
172
|
+
|
|
173
|
+
# Score, solve, lint, estimate
|
|
174
|
+
athanor score --env <env> --task <slug> --file solution.py
|
|
175
|
+
athanor solve --env <env> --task <slug> --model claude-opus-4-6
|
|
176
|
+
athanor lint Proof.lean # positional
|
|
177
|
+
athanor estimate --model claude-sonnet-4-6 --tasks 26
|
|
178
|
+
|
|
179
|
+
# Run/calibrate/stats
|
|
180
|
+
athanor runs --env <env>
|
|
181
|
+
athanor calibrate --run runs/<model>_run1.json
|
|
182
|
+
athanor stats runs/*.json
|
|
183
|
+
athanor compare runs/a.json runs/b.json
|
|
184
|
+
athanor eval-status [env_dir]
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Results save incrementally after each task (crash-resilient). `--patch` merges into an existing run file so you can resume interrupted sweeps.
|
|
188
|
+
|
|
189
|
+
## Requirements
|
|
190
|
+
|
|
191
|
+
- Python >= 3.9
|
|
192
|
+
- Lean 4 or Docker (for proof verification)
|
|
193
|
+
|
|
194
|
+
## License
|
|
195
|
+
|
|
196
|
+
Apache-2.0
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/athanor-ai/athanor-website/main/logo.svg" width="64" height="64" alt="Athanor">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">athanor-ai</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Lean 4 proof verification as agentic training signal.</strong><br>
|
|
9
|
+
Turn formal proofs into reward functions. Score agent output with compilers, not judges.<br><br>
|
|
10
|
+
<a href="https://athanor-ai.com">athanor-ai.com</a>
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
Your agent writes code. Then it writes a proof that the code is correct. The Lean 4 compiler checks the proof. The result is a training signal with no ambiguity.
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
import athanor
|
|
19
|
+
|
|
20
|
+
# Verify a Lean 4 proof
|
|
21
|
+
result = athanor.verify_proof("""
|
|
22
|
+
theorem add_comm (a b : Nat) : a + b = b + a := by
|
|
23
|
+
omega
|
|
24
|
+
""")
|
|
25
|
+
|
|
26
|
+
print(result.compiles) # True
|
|
27
|
+
print(result.has_sorry) # False
|
|
28
|
+
print(result.score) # 1.0
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install athanor-sdk
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## What this solves
|
|
38
|
+
|
|
39
|
+
You have domain expertise. You know what correct code looks like. You want an AI agent to produce verified solutions, not guesses.
|
|
40
|
+
|
|
41
|
+
The problem: LLM judges are noisy. Unit tests are brittle. Benchmarks don't produce training signal.
|
|
42
|
+
|
|
43
|
+
The solution: Lean 4 formal proofs are deterministic, machine-checked, and produce continuous reward signal (full proof = 1.0, partial = 0.35, broken = 0.25).
|
|
44
|
+
|
|
45
|
+
## Verify proofs
|
|
46
|
+
|
|
47
|
+
Check if a Lean 4 proof compiles. Detect `sorry` placeholders. Catch banned constructs (`axiom`, `import Mathlib`, `unsafe`).
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from athanor import verify_proof, check_sorry, score_proof
|
|
51
|
+
|
|
52
|
+
# Full verification with detailed result
|
|
53
|
+
result = verify_proof(proof_code)
|
|
54
|
+
result.compiles # did it compile?
|
|
55
|
+
result.has_sorry # any incomplete proof markers?
|
|
56
|
+
result.sorry_count # how many sorry placeholders?
|
|
57
|
+
result.score # 0.0 - 1.0
|
|
58
|
+
result.status # "full_proof" | "partial_proof" | "compile_error" | "banned"
|
|
59
|
+
result.errors # compiler error messages
|
|
60
|
+
|
|
61
|
+
# Quick score (just the float)
|
|
62
|
+
score = score_proof(proof_code) # 1.0, 0.35, 0.25, or 0.0
|
|
63
|
+
|
|
64
|
+
# Check for sorry without full compilation
|
|
65
|
+
has_sorry, count = check_sorry(proof_code)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Works with local Lean 4 installation or via Docker (`ghcr.io/leanprover/lean4`).
|
|
69
|
+
|
|
70
|
+
## Score agent output
|
|
71
|
+
|
|
72
|
+
Pair code with a proof. Score both. Use the result as reward.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
import athanor
|
|
76
|
+
|
|
77
|
+
env = athanor.make("my-environment", task="my-task")
|
|
78
|
+
env.reset()
|
|
79
|
+
|
|
80
|
+
result = env.score({
|
|
81
|
+
"kernel.py": agent_code,
|
|
82
|
+
"proof.lean": agent_proof,
|
|
83
|
+
})
|
|
84
|
+
|
|
85
|
+
# Scoring layers:
|
|
86
|
+
# 1. Does the code work? (verifier checks)
|
|
87
|
+
# 2. Does the proof compile? (Lean compiler)
|
|
88
|
+
# 3. Is the proof complete? (no sorry)
|
|
89
|
+
print(result.score) # combined score
|
|
90
|
+
print(result.lean_status) # proof status
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Agent retry with verifier feedback
|
|
94
|
+
|
|
95
|
+
Agent gets the scoring output and tries again. No human in the loop. The verifier feedback is the teacher.
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
results = env.run(
|
|
99
|
+
model="anthropic/claude-sonnet-4-6",
|
|
100
|
+
api_key="...",
|
|
101
|
+
max_retries=3,
|
|
102
|
+
target_score=0.95,
|
|
103
|
+
)
|
|
104
|
+
# Attempt 1: 0.35 (code correct, proof has sorry)
|
|
105
|
+
# Attempt 2: 0.72 (proof compiles, 2 sorry remaining)
|
|
106
|
+
# Attempt 3: 0.98 (full proof, verified)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## RL training
|
|
110
|
+
|
|
111
|
+
Use proof scores as reward signal in any RL framework.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from trl import PPOTrainer
|
|
115
|
+
|
|
116
|
+
env = athanor.make("my-environment")
|
|
117
|
+
trainer = PPOTrainer(
|
|
118
|
+
reward_fn=lambda completions: env.reward_fn(completions),
|
|
119
|
+
...
|
|
120
|
+
)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Compatible with TRL, veRL, NeMo-RL, or any custom training loop.
|
|
124
|
+
|
|
125
|
+
## Proof scoring
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
proof_multiplier:
|
|
129
|
+
1.00 full proof (compiles, no sorry)
|
|
130
|
+
0.35 partial proof (compiles with sorry)
|
|
131
|
+
0.25 broken proof (does not compile)
|
|
132
|
+
0.15 no proof submitted
|
|
133
|
+
0.00 banned construct (axiom, Mathlib, unsafe)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Partial proofs produce gradient. An agent that proves 4 of 7 theorems scores higher than one that proves 0. This is the training signal.
|
|
137
|
+
|
|
138
|
+
## Getting environments
|
|
139
|
+
|
|
140
|
+
The `verify_proof` and `score_proof` functions work standalone with any Lean 4 code. For full environment scoring (code + proof + property tests), contact [athanor-ai.com](https://athanor-ai.com).
|
|
141
|
+
|
|
142
|
+
## CLI
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# Container sanity checks — 7 gates including shim re-export + reward-leak probe
|
|
146
|
+
athanor preflight --env <env_slug>
|
|
147
|
+
|
|
148
|
+
# Batch evaluate all tasks in an environment (or filter/patch)
|
|
149
|
+
athanor evaluate --env hw-cbmc --model anthropic/claude-sonnet-4-6
|
|
150
|
+
athanor evaluate --env hw-cbmc --tasks fix_arb_lock,fix_tlb_ctrl # subset
|
|
151
|
+
athanor evaluate --env hw-cbmc --patch runs/prev_run1.json # resume
|
|
152
|
+
|
|
153
|
+
# Score, solve, lint, estimate
|
|
154
|
+
athanor score --env <env> --task <slug> --file solution.py
|
|
155
|
+
athanor solve --env <env> --task <slug> --model claude-opus-4-6
|
|
156
|
+
athanor lint Proof.lean # positional
|
|
157
|
+
athanor estimate --model claude-sonnet-4-6 --tasks 26
|
|
158
|
+
|
|
159
|
+
# Run/calibrate/stats
|
|
160
|
+
athanor runs --env <env>
|
|
161
|
+
athanor calibrate --run runs/<model>_run1.json
|
|
162
|
+
athanor stats runs/*.json
|
|
163
|
+
athanor compare runs/a.json runs/b.json
|
|
164
|
+
athanor eval-status [env_dir]
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Results save incrementally after each task (crash-resilient). `--patch` merges into an existing run file so you can resume interrupted sweeps.
|
|
168
|
+
|
|
169
|
+
## Requirements
|
|
170
|
+
|
|
171
|
+
- Python >= 3.9
|
|
172
|
+
- Lean 4 or Docker (for proof verification)
|
|
173
|
+
|
|
174
|
+
## License
|
|
175
|
+
|
|
176
|
+
Apache-2.0
|