athanor-sdk 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. athanor_sdk-0.3.2/.coverage +0 -0
  2. athanor_sdk-0.3.2/.github/dependabot.yml +23 -0
  3. athanor_sdk-0.3.2/.github/workflows/ci.yml +91 -0
  4. athanor_sdk-0.3.2/.gitignore +12 -0
  5. athanor_sdk-0.3.2/PKG-INFO +196 -0
  6. athanor_sdk-0.3.2/README.md +176 -0
  7. athanor_sdk-0.3.2/docs/task-builder.md +320 -0
  8. athanor_sdk-0.3.2/pyproject.toml +40 -0
  9. athanor_sdk-0.3.2/src/athanor/__init__.py +28 -0
  10. athanor_sdk-0.3.2/src/athanor/calibrate.py +419 -0
  11. athanor_sdk-0.3.2/src/athanor/cli.py +1050 -0
  12. athanor_sdk-0.3.2/src/athanor/compare.py +89 -0
  13. athanor_sdk-0.3.2/src/athanor/env.py +376 -0
  14. athanor_sdk-0.3.2/src/athanor/estimate.py +97 -0
  15. athanor_sdk-0.3.2/src/athanor/eval_status.py +183 -0
  16. athanor_sdk-0.3.2/src/athanor/lean.py +262 -0
  17. athanor_sdk-0.3.2/src/athanor/lint.py +80 -0
  18. athanor_sdk-0.3.2/src/athanor/preflight.py +183 -0
  19. athanor_sdk-0.3.2/src/athanor/property.py +139 -0
  20. athanor_sdk-0.3.2/src/athanor/providers.py +194 -0
  21. athanor_sdk-0.3.2/src/athanor/runner.py +1225 -0
  22. athanor_sdk-0.3.2/src/athanor/scoring.py +102 -0
  23. athanor_sdk-0.3.2/src/athanor/stats.py +95 -0
  24. athanor_sdk-0.3.2/src/athanor/types.py +40 -0
  25. athanor_sdk-0.3.2/tests/__init__.py +0 -0
  26. athanor_sdk-0.3.2/tests/test_calibrate.py +200 -0
  27. athanor_sdk-0.3.2/tests/test_calibrate_modes.py +2226 -0
  28. athanor_sdk-0.3.2/tests/test_cli.py +376 -0
  29. athanor_sdk-0.3.2/tests/test_compare.py +162 -0
  30. athanor_sdk-0.3.2/tests/test_env.py +106 -0
  31. athanor_sdk-0.3.2/tests/test_eval_status.py +222 -0
  32. athanor_sdk-0.3.2/tests/test_integration.py +568 -0
  33. athanor_sdk-0.3.2/tests/test_preflight_lint_estimate.py +362 -0
  34. athanor_sdk-0.3.2/tests/test_property.py +242 -0
  35. athanor_sdk-0.3.2/tests/test_providers.py +178 -0
  36. athanor_sdk-0.3.2/tests/test_retry_context.py +87 -0
  37. athanor_sdk-0.3.2/tests/test_runner.py +639 -0
  38. athanor_sdk-0.3.2/tests/test_stats.py +215 -0
  39. athanor_sdk-0.3.2/tests/test_types.py +75 -0
Binary file
@@ -0,0 +1,23 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "pip"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "weekly"
7
+ day: "monday"
8
+ open-pull-requests-limit: 5
9
+ labels:
10
+ - "dependencies"
11
+ - "security"
12
+ commit-message:
13
+ prefix: "deps"
14
+ include: "scope"
15
+ # We ship to PyPI so transitive deps land on customers. Be paranoid:
16
+ # group patch-level updates so the maintenance noise stays manageable
17
+ # but security-critical updates surface immediately.
18
+ groups:
19
+ patch-updates:
20
+ patterns:
21
+ - "*"
22
+ update-types:
23
+ - "patch"
@@ -0,0 +1,91 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.10", "3.12"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: ${{ matrix.python-version }}
20
+
21
+ - name: Install package with dev dependencies
22
+ run: pip install -e ".[dev]"
23
+
24
+ - name: Lint (syntax check)
25
+ run: |
26
+ # Recursive — covers subpackages, not just src/athanor/*.py top level.
27
+ find src -name "*.py" -exec python -m py_compile {} +
28
+
29
+ - name: Run tests with coverage
30
+ # Threshold is 45%: current coverage is ~50%, leaving a 5-point
31
+ # buffer for legitimate test reorgs while still catching significant
32
+ # regressions. Raise as we add tests — 70% is the eventual goal
33
+ # for customer-facing code, but we don't want to gate the entire
34
+ # CI on aspirational targets today.
35
+ run: |
36
+ pip install pytest-cov -q
37
+ pytest tests/ -v --cov=src/athanor --cov-report=term-missing --cov-fail-under=45
38
+
39
+ security:
40
+ runs-on: ubuntu-latest
41
+ # Doesn't gate the test job — security findings are advisory.
42
+ # We ship to PyPI; supply chain risk is real but advisories alone
43
+ # shouldn't block releases. Same threat-model approach as
44
+ # athanor-builder/SECURITY.md.
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+
48
+ - uses: actions/setup-python@v5
49
+ with:
50
+ python-version: "3.12"
51
+
52
+ - name: Install pip-audit
53
+ run: pip install pip-audit -q
54
+
55
+ - name: Audit declared dependencies
56
+ # Surface findings as warnings, not hard fail. New critical CVEs
57
+ # in our dep tree should be flagged but not block an unrelated PR.
58
+ run: |
59
+ pip-audit --desc 2>&1 | tee /tmp/audit.log || true
60
+ if grep -qi "vulnerability\|GHSA-\|CVE-" /tmp/audit.log 2>/dev/null; then
61
+ echo "::warning::pip-audit found vulnerabilities — see logs above"
62
+ fi
63
+ continue-on-error: true
64
+
65
+ publish:
66
+ if: startsWith(github.ref, 'refs/tags/v')
67
+ needs: test
68
+ runs-on: ubuntu-latest
69
+ permissions:
70
+ id-token: write
71
+ steps:
72
+ - uses: actions/checkout@v4
73
+
74
+ - uses: actions/setup-python@v5
75
+ with:
76
+ python-version: "3.12"
77
+
78
+ - name: Install build tools
79
+ run: pip install build twine
80
+
81
+ - name: Build package
82
+ run: python -m build
83
+
84
+ - name: Check package
85
+ run: twine check dist/*
86
+
87
+ - name: Publish to PyPI
88
+ env:
89
+ TWINE_USERNAME: __token__
90
+ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
91
+ run: twine upload dist/*
@@ -0,0 +1,12 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
7
+ *.egg
8
+ .pytest_cache/
9
+ .mypy_cache/
10
+ *.so
11
+ .env
12
+ .venv/
@@ -0,0 +1,196 @@
1
+ Metadata-Version: 2.4
2
+ Name: athanor-sdk
3
+ Version: 0.3.2
4
+ Summary: Formal verification as agentic training signal — CLI + self-hosted runner
5
+ Project-URL: Homepage, https://athanor-ai.com
6
+ Project-URL: Repository, https://github.com/athanor-ai/athanor-sdk
7
+ Author-email: Athanor AI <aidan@athanor-ai.com>
8
+ License: Apache-2.0
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Python: >=3.9
15
+ Provides-Extra: dev
16
+ Requires-Dist: pytest>=7.0; extra == 'dev'
17
+ Provides-Extra: multi-model
18
+ Requires-Dist: litellm>=1.50.0; extra == 'multi-model'
19
+ Description-Content-Type: text/markdown
20
+
21
+ <p align="center">
22
+ <img src="https://raw.githubusercontent.com/athanor-ai/athanor-website/main/logo.svg" width="64" height="64" alt="Athanor">
23
+ </p>
24
+
25
+ <h1 align="center">athanor-ai</h1>
26
+
27
+ <p align="center">
28
+ <strong>Lean 4 proof verification as agentic training signal.</strong><br>
29
+ Turn formal proofs into reward functions. Score agent output with compilers, not judges.<br><br>
30
+ <a href="https://athanor-ai.com">athanor-ai.com</a>
31
+ </p>
32
+
33
+ ---
34
+
35
+ Your agent writes code. Then it writes a proof that the code is correct. The Lean 4 compiler checks the proof. The result is a training signal with no ambiguity.
36
+
37
+ ```python
38
+ import athanor
39
+
40
+ # Verify a Lean 4 proof
41
+ result = athanor.verify_proof("""
42
+ theorem add_comm (a b : Nat) : a + b = b + a := by
43
+ omega
44
+ """)
45
+
46
+ print(result.compiles) # True
47
+ print(result.has_sorry) # False
48
+ print(result.score) # 1.0
49
+ ```
50
+
51
+ ## Install
52
+
53
+ ```bash
54
+ pip install athanor-sdk
55
+ ```
56
+
57
+ ## What this solves
58
+
59
+ You have domain expertise. You know what correct code looks like. You want an AI agent to produce verified solutions, not guesses.
60
+
61
+ The problem: LLM judges are noisy. Unit tests are brittle. Benchmarks don't produce training signal.
62
+
63
+ The solution: Lean 4 formal proofs are deterministic, machine-checked, and produce graded reward signal (full proof = 1.0, partial = 0.35, broken = 0.25).
64
+
65
+ ## Verify proofs
66
+
67
+ Check if a Lean 4 proof compiles. Detect `sorry` placeholders. Catch banned constructs (`axiom`, `import Mathlib`, `unsafe`).
68
+
69
+ ```python
70
+ from athanor import verify_proof, check_sorry, score_proof
71
+
72
+ # Full verification with detailed result
73
+ result = verify_proof(proof_code)
74
+ result.compiles # did it compile?
75
+ result.has_sorry # any incomplete proof markers?
76
+ result.sorry_count # how many sorry placeholders?
77
+ result.score # 0.0 - 1.0
78
+ result.status # "full_proof" | "partial_proof" | "compile_error" | "banned"
79
+ result.errors # compiler error messages
80
+
81
+ # Quick score (just the float)
82
+ score = score_proof(proof_code) # 1.0, 0.35, 0.25, or 0.0
83
+
84
+ # Check for sorry without full compilation
85
+ has_sorry, count = check_sorry(proof_code)
86
+ ```
87
+
88
+ Works with local Lean 4 installation or via Docker (`ghcr.io/leanprover/lean4`).
89
+
90
+ ## Score agent output
91
+
92
+ Pair code with a proof. Score both. Use the result as reward.
93
+
94
+ ```python
95
+ import athanor
96
+
97
+ env = athanor.make("my-environment", task="my-task")
98
+ env.reset()
99
+
100
+ result = env.score({
101
+ "kernel.py": agent_code,
102
+ "proof.lean": agent_proof,
103
+ })
104
+
105
+ # Scoring layers:
106
+ # 1. Does the code work? (verifier checks)
107
+ # 2. Does the proof compile? (Lean compiler)
108
+ # 3. Is the proof complete? (no sorry)
109
+ print(result.score) # combined score
110
+ print(result.lean_status) # proof status
111
+ ```
112
+
113
+ ## Agent retry with verifier feedback
114
+
115
+ Agent gets the scoring output and tries again. No human in the loop. The verifier feedback is the teacher.
116
+
117
+ ```python
118
+ results = env.run(
119
+ model="anthropic/claude-sonnet-4-6",
120
+ api_key="...",
121
+ max_retries=3,
122
+ target_score=0.95,
123
+ )
124
+ # Attempt 1: 0.35 (code correct, proof has sorry)
125
+ # Attempt 2: 0.72 (proof compiles, 2 sorry remaining)
126
+ # Attempt 3: 0.98 (full proof, verified)
127
+ ```
128
+
129
+ ## RL training
130
+
131
+ Use proof scores as reward signal in any RL framework.
132
+
133
+ ```python
134
+ from trl import PPOTrainer
135
+
136
+ env = athanor.make("my-environment")
137
+ trainer = PPOTrainer(
138
+ reward_fn=lambda completions: env.reward_fn(completions),
139
+ ...
140
+ )
141
+ ```
142
+
143
+ Compatible with TRL, veRL, NeMo-RL, or any custom training loop.
144
+
145
+ ## Proof scoring
146
+
147
+ ```
148
+ proof_multiplier:
149
+ 1.00 full proof (compiles, no sorry)
150
+ 0.35 partial proof (compiles with sorry)
151
+ 0.25 broken proof (does not compile)
152
+ 0.15 no proof submitted
153
+ 0.00 banned construct (axiom, Mathlib, unsafe)
154
+ ```
155
+
156
+ Partial proofs produce gradient. An agent that proves 4 of 7 theorems scores higher than one that proves 0. This is the training signal.
157
+
158
+ ## Getting environments
159
+
160
+ The `verify_proof` and `score_proof` functions work standalone with any Lean 4 code. For full environment scoring (code + proof + property tests), contact [athanor-ai.com](https://athanor-ai.com).
161
+
162
+ ## CLI
163
+
164
+ ```bash
165
+ # Container sanity checks — 7 gates including shim re-export + reward-leak probe
166
+ athanor preflight --env <env_slug>
167
+
168
+ # Batch evaluate all tasks in an environment (or filter/patch)
169
+ athanor evaluate --env hw-cbmc --model anthropic/claude-sonnet-4-6
170
+ athanor evaluate --env hw-cbmc --tasks fix_arb_lock,fix_tlb_ctrl # subset
171
+ athanor evaluate --env hw-cbmc --patch runs/prev_run1.json # resume
172
+
173
+ # Score, solve, lint, estimate
174
+ athanor score --env <env> --task <slug> --file solution.py
175
+ athanor solve --env <env> --task <slug> --model claude-opus-4-6
176
+ athanor lint Proof.lean # positional
177
+ athanor estimate --model claude-sonnet-4-6 --tasks 26
178
+
179
+ # Run/calibrate/stats
180
+ athanor runs --env <env>
181
+ athanor calibrate --run runs/<model>_run1.json
182
+ athanor stats runs/*.json
183
+ athanor compare runs/a.json runs/b.json
184
+ athanor eval-status [env_dir]
185
+ ```
186
+
187
+ Results save incrementally after each task (crash-resilient). `--patch` merges into an existing run file so you can resume interrupted sweeps.
188
+
189
+ ## Requirements
190
+
191
+ - Python >= 3.9
192
+ - Lean 4 or Docker (for proof verification)
193
+
194
+ ## License
195
+
196
+ Apache-2.0
@@ -0,0 +1,176 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/athanor-ai/athanor-website/main/logo.svg" width="64" height="64" alt="Athanor">
3
+ </p>
4
+
5
+ <h1 align="center">athanor-ai</h1>
6
+
7
+ <p align="center">
8
+ <strong>Lean 4 proof verification as agentic training signal.</strong><br>
9
+ Turn formal proofs into reward functions. Score agent output with compilers, not judges.<br><br>
10
+ <a href="https://athanor-ai.com">athanor-ai.com</a>
11
+ </p>
12
+
13
+ ---
14
+
15
+ Your agent writes code. Then it writes a proof that the code is correct. The Lean 4 compiler checks the proof. The result is a training signal with no ambiguity.
16
+
17
+ ```python
18
+ import athanor
19
+
20
+ # Verify a Lean 4 proof
21
+ result = athanor.verify_proof("""
22
+ theorem add_comm (a b : Nat) : a + b = b + a := by
23
+ omega
24
+ """)
25
+
26
+ print(result.compiles) # True
27
+ print(result.has_sorry) # False
28
+ print(result.score) # 1.0
29
+ ```
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install athanor-sdk
35
+ ```
36
+
37
+ ## What this solves
38
+
39
+ You have domain expertise. You know what correct code looks like. You want an AI agent to produce verified solutions, not guesses.
40
+
41
+ The problem: LLM judges are noisy. Unit tests are brittle. Benchmarks don't produce training signal.
42
+
43
+ The solution: Lean 4 formal proofs are deterministic, machine-checked, and produce graded reward signal (full proof = 1.0, partial = 0.35, broken = 0.25).
44
+
45
+ ## Verify proofs
46
+
47
+ Check if a Lean 4 proof compiles. Detect `sorry` placeholders. Catch banned constructs (`axiom`, `import Mathlib`, `unsafe`).
48
+
49
+ ```python
50
+ from athanor import verify_proof, check_sorry, score_proof
51
+
52
+ # Full verification with detailed result
53
+ result = verify_proof(proof_code)
54
+ result.compiles # did it compile?
55
+ result.has_sorry # any incomplete proof markers?
56
+ result.sorry_count # how many sorry placeholders?
57
+ result.score # 0.0 - 1.0
58
+ result.status # "full_proof" | "partial_proof" | "compile_error" | "banned"
59
+ result.errors # compiler error messages
60
+
61
+ # Quick score (just the float)
62
+ score = score_proof(proof_code) # 1.0, 0.35, 0.25, or 0.0
63
+
64
+ # Check for sorry without full compilation
65
+ has_sorry, count = check_sorry(proof_code)
66
+ ```
67
+
68
+ Works with local Lean 4 installation or via Docker (`ghcr.io/leanprover/lean4`).
69
+
70
+ ## Score agent output
71
+
72
+ Pair code with a proof. Score both. Use the result as reward.
73
+
74
+ ```python
75
+ import athanor
76
+
77
+ env = athanor.make("my-environment", task="my-task")
78
+ env.reset()
79
+
80
+ result = env.score({
81
+ "kernel.py": agent_code,
82
+ "proof.lean": agent_proof,
83
+ })
84
+
85
+ # Scoring layers:
86
+ # 1. Does the code work? (verifier checks)
87
+ # 2. Does the proof compile? (Lean compiler)
88
+ # 3. Is the proof complete? (no sorry)
89
+ print(result.score) # combined score
90
+ print(result.lean_status) # proof status
91
+ ```
92
+
93
+ ## Agent retry with verifier feedback
94
+
95
+ Agent gets the scoring output and tries again. No human in the loop. The verifier feedback is the teacher.
96
+
97
+ ```python
98
+ results = env.run(
99
+ model="anthropic/claude-sonnet-4-6",
100
+ api_key="...",
101
+ max_retries=3,
102
+ target_score=0.95,
103
+ )
104
+ # Attempt 1: 0.35 (code correct, proof has sorry)
105
+ # Attempt 2: 0.72 (proof compiles, 2 sorry remaining)
106
+ # Attempt 3: 0.98 (full proof, verified)
107
+ ```
108
+
109
+ ## RL training
110
+
111
+ Use proof scores as reward signal in any RL framework.
112
+
113
+ ```python
114
+ from trl import PPOTrainer
115
+
116
+ env = athanor.make("my-environment")
117
+ trainer = PPOTrainer(
118
+ reward_fn=lambda completions: env.reward_fn(completions),
119
+ ...
120
+ )
121
+ ```
122
+
123
+ Compatible with TRL, veRL, NeMo-RL, or any custom training loop.
124
+
125
+ ## Proof scoring
126
+
127
+ ```
128
+ proof_multiplier:
129
+ 1.00 full proof (compiles, no sorry)
130
+ 0.35 partial proof (compiles with sorry)
131
+ 0.25 broken proof (does not compile)
132
+ 0.15 no proof submitted
133
+ 0.00 banned construct (axiom, Mathlib, unsafe)
134
+ ```
135
+
136
+ Partial proofs produce gradient. An agent that proves 4 of 7 theorems scores higher than one that proves 0. This is the training signal.
137
+
138
+ ## Getting environments
139
+
140
+ The `verify_proof` and `score_proof` functions work standalone with any Lean 4 code. For full environment scoring (code + proof + property tests), contact [athanor-ai.com](https://athanor-ai.com).
141
+
142
+ ## CLI
143
+
144
+ ```bash
145
+ # Container sanity checks — 7 gates including shim re-export + reward-leak probe
146
+ athanor preflight --env <env_slug>
147
+
148
+ # Batch evaluate all tasks in an environment (or filter/patch)
149
+ athanor evaluate --env hw-cbmc --model anthropic/claude-sonnet-4-6
150
+ athanor evaluate --env hw-cbmc --tasks fix_arb_lock,fix_tlb_ctrl # subset
151
+ athanor evaluate --env hw-cbmc --patch runs/prev_run1.json # resume
152
+
153
+ # Score, solve, lint, estimate
154
+ athanor score --env <env> --task <slug> --file solution.py
155
+ athanor solve --env <env> --task <slug> --model claude-opus-4-6
156
+ athanor lint Proof.lean # positional
157
+ athanor estimate --model claude-sonnet-4-6 --tasks 26
158
+
159
+ # Run/calibrate/stats
160
+ athanor runs --env <env>
161
+ athanor calibrate --run runs/<model>_run1.json
162
+ athanor stats runs/*.json
163
+ athanor compare runs/a.json runs/b.json
164
+ athanor eval-status [env_dir]
165
+ ```
166
+
167
+ Results save incrementally after each task (crash-resilient). `--patch` merges into an existing run file so you can resume interrupted sweeps.
168
+
169
+ ## Requirements
170
+
171
+ - Python >= 3.9
172
+ - Lean 4 or Docker (for proof verification)
173
+
174
+ ## License
175
+
176
+ Apache-2.0