athanor-sdk 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
athanor/__init__.py ADDED
@@ -0,0 +1,28 @@
+ """Athanor AI — Lean 4 proof verification as RL training signal."""
+
+ from athanor.env import Environment
+ from athanor.types import ScoreResult, TaskConfig
+ from athanor.lean import verify_proof, check_sorry, score_proof, ProofResult
+
+ __version__ = "0.3.2"
+
+
+ def make(env_name: str, task: str | None = None, **kwargs) -> Environment:
+     """Create an Athanor environment.
+
+     Args:
+         env_name: Environment image name (e.g. 'neuron-nki-kernels').
+         task: Optional task ID to start with.
+         **kwargs: Passed to Environment (image=, timeout=).
+
+     Returns:
+         Environment instance ready for reset()/score() calls.
+     """
+     return Environment(env_name, task=task, **kwargs)
+
+
+ __all__ = [
+     "make", "Environment", "ScoreResult", "TaskConfig",
+     "verify_proof", "check_sorry", "score_proof", "ProofResult",
+     "__version__",
+ ]
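
A minimal usage sketch for `make()` (not part of the package): the `reset()`/`score()` call shapes are assumed from the docstring, since `athanor/env.py` is not shown in this diff, and the task ID is hypothetical.

    import athanor

    env = athanor.make("neuron-nki-kernels", task="example-task-001", timeout=600)
    task = env.reset()                          # assumed API: returns the initial task payload
    result = env.score("<candidate solution>")  # assumed API: returns a ScoreResult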
athanor/calibrate.py ADDED
@@ -0,0 +1,419 @@
+ """Sigmoid calibrator — fit sigmoid parameters from score data.
+
+ The calibration maps raw scores (e.g. property test pass fractions) to
+ calibrated scores in [0, 1] using a sigmoid. This helps separate
+ "trying" from "succeeding" — the sigmoid center marks the threshold
+ where scores start counting.
+
+     score_calibrated = 1 / (1 + exp(-scale * (raw - center)))
+ """
+ from __future__ import annotations
+
+ import json
+ import math
+ from pathlib import Path
+
+
+ def sigmoid(x: float, center: float, scale: float) -> float:
+     """Standard logistic sigmoid centered at `center` with steepness `scale`."""
+     z = scale * (x - center)
+     # Clamp to avoid overflow
+     if z > 500:
+         return 1.0
+     if z < -500:
+         return 0.0
+     return 1.0 / (1.0 + math.exp(-z))
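
A quick numeric check of `sigmoid` against the module formula (values rounded; the inputs are illustrative):

    from athanor.calibrate import sigmoid

    sigmoid(0.7, center=0.7, scale=8.0)   # 0.5: raw score exactly at the center
    sigmoid(0.9, center=0.7, scale=8.0)   # ~0.832: above the threshold
    sigmoid(0.5, center=0.7, scale=8.0)   # ~0.168: below the threshold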
+
+
+ def fit_sigmoid(
+     raw_scores: list[float],
+     targets: list[float] | None = None,
+     *,
+     max_iter: int = 200,
+     lr: float = 0.1,
+ ) -> tuple[float, float]:
+     """Fit sigmoid center + scale via gradient descent on MSE loss.
+
+     If targets is None, fits to binarized targets (1 if raw > median, else 0).
+     This finds a natural decision boundary in the score distribution.
+
+     Args:
+         raw_scores: raw scores from property tests / simulations
+         targets: optional target calibrated scores (0..1). If None, uses
+             binarization at the median of raw_scores.
+         max_iter: gradient descent iterations
+         lr: learning rate
+
+     Returns:
+         (center, scale) tuple
+     """
+     if not raw_scores:
+         return 0.5, 8.0
+
+     if targets is None:
+         sorted_raw = sorted(raw_scores)
+         median = sorted_raw[len(sorted_raw) // 2]
+         targets = [1.0 if r > median else 0.0 for r in raw_scores]
+
+     if len(raw_scores) != len(targets):
+         raise ValueError("raw_scores and targets must have same length")
+
+     # Initialize at the mean of the raw scores
+     mean_raw = sum(raw_scores) / len(raw_scores)
+     center = mean_raw
+     scale = 8.0
+     n = len(raw_scores)
+
+     for _ in range(max_iter):
+         d_center = 0.0
+         d_scale = 0.0
+         for x, t in zip(raw_scores, targets):
+             p = sigmoid(x, center, scale)
+             err = p - t
+             # d/d_center sigmoid = -scale * p * (1-p)
+             # d/d_scale sigmoid = (x - center) * p * (1-p)
+             sigmoid_deriv = p * (1.0 - p)
+             d_center += err * (-scale * sigmoid_deriv)
+             d_scale += err * ((x - center) * sigmoid_deriv)
+         d_center /= n
+         d_scale /= n
+         center -= lr * d_center
+         scale -= lr * d_scale
+         # Keep scale positive and bounded; clamp center to the [0, 1] score range
+         if scale < 0.1:
+             scale = 0.1
+         if scale > 100.0:
+             scale = 100.0
+         if center < 0.0:
+             center = 0.0
+         if center > 1.0:
+             center = 1.0
+
+     return round(center, 4), round(scale, 4)
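
A self-contained sketch of the self-supervised path (targets=None); the raw scores are made up for illustration:

    from athanor.calibrate import fit_sigmoid, sigmoid

    # Bimodal raw scores: a failing cluster and a succeeding cluster.
    raw = [0.10, 0.15, 0.20, 0.25, 0.80, 0.85, 0.90, 0.95]
    center, scale = fit_sigmoid(raw)   # binarizes at the median (0.80; strictly above counts as 1)
    calibrated = [sigmoid(r, center, scale) for r in raw]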
+
+
+ def calibrate_run_file(
+     run_path: str | Path,
+     *,
+     metric: str = "base_completeness_score",
+ ) -> tuple[float, float]:
+     """Load a run JSON file and fit sigmoid from the raw scores.
+
+     Args:
+         run_path: path to a run JSON (format: {results: [{score, scoring_metadata: {...}}]})
+         metric: metadata key to use as the raw score. Defaults to
+             base_completeness_score, which is the pre-sigmoid score
+             in most environments.
+
+     Returns:
+         (center, scale) fitted to the metric values in the run.
+     """
+     path = Path(run_path)
+     data = json.loads(path.read_text())
+     results = data.get("results")
+     if results is None and "score" in data and "task" in data:
+         # A bare single-result file: wrap it so the loop below still works.
+         results = [data]
+     elif results is None:
+         results = []
+     raw = []
+     for r in results:
+         meta = r.get("scoring_metadata", {})
+         val = meta.get(metric)
+         if val is None:
+             val = r.get("score")
+         if isinstance(val, (int, float)):
+             raw.append(float(val))
+     if not raw:
+         raise ValueError(f"No valid scores found in {run_path} for metric {metric!r}")
+     return fit_sigmoid(raw)
+
+
+ def apply_sigmoid(raw_scores: list[float], center: float, scale: float) -> list[float]:
+     """Apply a fitted sigmoid to a list of raw scores."""
+     return [sigmoid(r, center, scale) for r in raw_scores]
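
An end-to-end sketch: the run file below is a hypothetical minimal instance of the documented shape, written to a temp directory.

    import json
    import tempfile
    from pathlib import Path
    from athanor.calibrate import apply_sigmoid, calibrate_run_file

    run = {"results": [
        {"score": 0.2, "scoring_metadata": {"base_completeness_score": 0.25}},
        {"score": 0.3, "scoring_metadata": {"base_completeness_score": 0.40}},
        {"score": 0.9, "scoring_metadata": {"base_completeness_score": 0.92}},
    ]}
    path = Path(tempfile.mkdtemp()) / "run.json"
    path.write_text(json.dumps(run))

    center, scale = calibrate_run_file(path)   # fits on the metric values
    print(apply_sigmoid([0.25, 0.40, 0.92], center, scale))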
+
+
+ # ---------------------------------------------------------------------------
+ # Mode-based recompute
+ # ---------------------------------------------------------------------------
+ #
+ # Athanor scoring containers emit a `scoring_metadata` object with
+ # `mode_scores = {"training": float, "eval": float}` when both modes
+ # apply. This module reads those precomputed values so you can flip the
+ # displayed score between modes without rerunning the eval — useful for
+ # regenerating heatmaps under a different reward regime.
+ #
+ # "training" mode is gradient-friendly (partial credit for progress on
+ # compile + property gates). "eval" mode is strict. The authoritative
+ # computation happens inside the scoring container; this module only
+ # reads the values.
+ #
+ # Backward compatibility: older run files without `mode_scores` fall
+ # back to the top-level `score` field. Safe on any run file.
+
+ # Cheat category labels that customers may see in scoring_metadata.cheat_category.
+ # This is a partial public-facing set — the full enforcement vocabulary
+ # lives in each env's scoring container. Used for dashboard labels + filtering.
+ KNOWN_CHEAT_CATEGORIES = (
+     "stub_detection",
+     "banned_construct",
+     "file_not_found",
+     "interface_violation",
+     "null_implementation",
+ )
+
+ VALID_MODES = ("training", "eval")
+ DEFAULT_MODE = "training"
+
+ # Where the per-customer "current mode for each env" state lives.
+ MODE_STATE_FILE = Path.home() / ".athanor" / "modes.json"
+
+
+ def _validate_mode(mode: str) -> None:
+     """Raise ValueError if mode is not one of the canonical modes."""
+     if mode not in VALID_MODES:
+         raise ValueError(f"mode must be one of {VALID_MODES}, got {mode!r}")
+
+
+ def recompute_score_for_mode(result: dict, mode: str) -> float:
+     """Recompute a single task result's score for the requested mode.
+
+     Resolution order (first hit wins):
+
+     1. **Precomputed mode_scores** — the authoritative source of truth.
+        Every post-rollout env emits this. The parser reads it directly
+        without recomputation.
+
+     2. **base_score + test_gate_passed** — APPROXIMATE fallback for
+        mid-migration envs that have the binary fields but haven't yet
+        emitted mode_scores. WARNING: base_score has env-specific
+        semantics (see schema docstring above) — for three-layer envs
+        (sb, neuro) and lean-bonus envs (NKI), this branch will return
+        a value that DIFFERS from what mode_scores.training would return.
+        The error is bounded (the difference is the layer-3 multiplier
+        and floors), but if precision matters and you care about cross-
+        env consistency, regenerate the run file via the env's scoring.py
+        so mode_scores gets populated.
+
+     3. **Legacy top-level `score`** — for old run files with no
+        canonical fields at all (pre-rollout, neuron-nki-kernels with
+        empty scoring_metadata). Returned as-is, no transformation.
+
+     4. **0.0** if nothing usable. Callers can spot it via the dropped
+        score and decide what to do.
+
+     Anti-cheat rules baked into branches 1 and 2:
+     - Training mode never gives credit when `test_gate_passed` is false.
+     - Eval mode never gives partial credit at all — the score is always
+       0.0 or 1.0 in branch 2, and whatever mode_scores.eval says in branch 1.
+     - A partial test pass rate is never extractable from this function;
+       `test_gate_passed` is the only test signal we trust.
+     - test_gate_passed=False with empty anti_cheat_violations is valid
+       (compile failures, no_progress stages) and still produces a hard zero.
+
+     Args:
+         result: a single task result dict (from run_file["results"][i]).
+         mode: "training" or "eval".
+
+     Returns:
+         The recomputed score as a float in [0.0, 1.0].
+
+     Raises:
+         ValueError: if `mode` is not in VALID_MODES.
+     """
+     _validate_mode(mode)
+     metadata = result.get("scoring_metadata") or {}
+
+     # 1. Precomputed mode_scores is the AUTHORITATIVE source of truth.
+     #    The env's scoring.py is responsible for getting the math right
+     #    and emits both modes at scoring time. All 10 post-rollout envs
+     #    emit this; the parser must prefer it over any recomputation.
+     mode_scores = metadata.get("mode_scores")
+     if isinstance(mode_scores, dict) and mode in mode_scores:
+         precomputed = mode_scores[mode]
+         if isinstance(precomputed, (int, float)):
+             return float(precomputed)
+
+     # 2. APPROXIMATE recompute from canonical individual fields. Only
+     #    used when mode_scores is missing — i.e. mid-migration envs.
+     #    For envs where base_score == mode_scores.training (hw-cbmc,
+     #    custom-tpu, congestion, dc, c-to-rust) this is exact. For
+     #    three-layer envs (sb, neuro) and lean-bonus envs (NKI) this
+     #    is APPROXIMATE because base_score is the layer-2 sigmoid alone
+     #    and the layer-3 multiplier/floors are not applied here. See
+     #    the docstring above. After the full 10-env rollout (2026-04-10)
+     #    this branch should never fire on a current run file.
+     base_score = metadata.get("base_score")
+     test_gate_passed = metadata.get("test_gate_passed")
+     if isinstance(base_score, (int, float)) and isinstance(test_gate_passed, bool):
+         base_score = float(base_score)
+         if mode == "training":
+             return base_score if test_gate_passed else 0.0
+         if mode == "eval":
+             return 1.0 if (base_score == 1.0 and test_gate_passed) else 0.0
+
+     # 3. Legacy fallback: no canonical fields. Use the top-level `score`
+     #    as-is. This is what pre-canonicalization run files end up with.
+     #    The anti-cheat agent will land the evaluate.py score=None
+     #    silent-drop fix soon, after which this branch is mostly dead —
+     #    but it stays as a safety net per their request.
+     legacy_score = result.get("score")
+     if isinstance(legacy_score, (int, float)):
+         return float(legacy_score)
+
+     # 4. Nothing usable — return 0.0 rather than crashing. Callers can
+     #    spot it via the dropped score and decide what to do.
+     return 0.0
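
The resolution order is easiest to see on hand-built results (all dicts below are hypothetical):

    from athanor.calibrate import recompute_score_for_mode

    # Branch 1: precomputed mode_scores wins, even when a legacy score exists.
    r1 = {"score": 0.4,
          "scoring_metadata": {"mode_scores": {"training": 0.62, "eval": 0.0}}}
    recompute_score_for_mode(r1, "training")   # 0.62
    recompute_score_for_mode(r1, "eval")       # 0.0

    # Branch 2: approximate fallback; training credit is gated on the tests.
    r2 = {"scoring_metadata": {"base_score": 0.7, "test_gate_passed": False}}
    recompute_score_for_mode(r2, "training")   # 0.0 (gate failed, no credit)

    # Branch 3: legacy run file with only a top-level score.
    recompute_score_for_mode({"score": 0.55}, "eval")   # 0.55, returned as-is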
+
+
+ def apply_mode_to_run_file(
+     path: str | Path,
+     mode: str,
+     *,
+     in_place: bool = False,
+     output_path: str | Path | None = None,
+ ) -> dict:
+     """Recompute every result's score in a run file for the given mode.
+
+     Reads the run JSON, walks `results[]`, replaces each result's `score`
+     field with the recomputed value for `mode`, and either returns the
+     new dict, writes back in place, or writes to `output_path`.
+
+     Original `scoring_metadata` is left untouched — only the top-level
+     `score` field on each result changes. This means you can flip modes
+     repeatedly without losing information.
+
+     A new top-level field `_athanor_mode` is added to the run file
+     indicating which mode the scores currently reflect, so finalize_readme
+     and other downstream tools can render the active mode in the UI.
+
+     Args:
+         path: path to the run JSON.
+         mode: "training" or "eval".
+         in_place: if True, overwrite `path` with the new content.
+         output_path: if provided, write the new content here instead.
+             Mutually exclusive with in_place.
+
+     Returns:
+         The recomputed dict (always, regardless of write mode).
+
+     Raises:
+         ValueError: if mode is invalid, or if both in_place and output_path
+             are set.
+     """
+     _validate_mode(mode)
+     if in_place and output_path is not None:
+         raise ValueError("in_place and output_path are mutually exclusive")
+
+     src = Path(path)
+     data = json.loads(src.read_text())
+
+     for result in data.get("results", []):
+         result["score"] = recompute_score_for_mode(result, mode)
+
+     data["_athanor_mode"] = mode
+
+     if in_place:
+         src.write_text(json.dumps(data, indent=2) + "\n")
+     elif output_path is not None:
+         Path(output_path).write_text(json.dumps(data, indent=2) + "\n")
+
+     return data
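
A usage sketch (file paths hypothetical): rewrite a run file's scores for eval mode without touching the original.

    from athanor.calibrate import apply_mode_to_run_file

    data = apply_mode_to_run_file("run.json", "eval", output_path="run.eval.json")
    data["_athanor_mode"]                    # "eval"
    [r["score"] for r in data["results"]]    # scores recomputed for eval mode
                                             # (legacy results pass through as-is)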
+
+
+ # ---------------------------------------------------------------------------
+ # Per-env mode persistence
+ # ---------------------------------------------------------------------------
+
+ def _read_mode_state() -> dict[str, str]:
+     """Read the user's mode state file. Returns {} if missing or corrupt."""
+     if not MODE_STATE_FILE.exists():
+         return {}
+     try:
+         return json.loads(MODE_STATE_FILE.read_text())
+     except (json.JSONDecodeError, OSError):
+         return {}
+
+
+ def _write_mode_state(state: dict[str, str]) -> None:
+     """Write the mode state file (mkdir parent if needed)."""
+     MODE_STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
+     MODE_STATE_FILE.write_text(json.dumps(state, indent=2, sort_keys=True) + "\n")
+
+
+ def get_env_mode(env_dir: str | Path) -> str:
+     """Return the active mode for an env, or 'training' if unset.
+
+     Mode is keyed by the resolved absolute path of `env_dir`, so the same
+     env at different mount points can have different modes if needed.
+     """
+     state = _read_mode_state()
+     key = str(Path(env_dir).resolve())
+     return state.get(key, DEFAULT_MODE)
+
+
+ def set_env_mode(env_dir: str | Path, mode: str) -> None:
+     """Persist the active mode for an env in ~/.athanor/modes.json.
+
+     Raises ValueError if `mode` is not in VALID_MODES.
+     """
+     _validate_mode(mode)
+     state = _read_mode_state()
+     state[str(Path(env_dir).resolve())] = mode
+     _write_mode_state(state)
+
+
+ def list_env_modes() -> dict[str, str]:
+     """Return a copy of the full mode state — all envs the user has touched."""
+     return dict(_read_mode_state())
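
A sketch of the persistence helpers (the env path is hypothetical; keys in the state file are resolved absolute paths):

    from athanor.calibrate import get_env_mode, set_env_mode, list_env_modes

    get_env_mode("envs/neuron-nki-kernels")           # "training" until explicitly set
    set_env_mode("envs/neuron-nki-kernels", "eval")   # persists to ~/.athanor/modes.json
    list_env_modes()                                  # {"/abs/path/to/envs/neuron-nki-kernels": "eval", ...}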
+
+
+ # ---------------------------------------------------------------------------
+ # Inspection helpers (used by `athanor calibrate show` once cli.py lands)
+ # ---------------------------------------------------------------------------
+
+ def summarize_run_file(path: str | Path) -> dict:
+     """Inspect a run file and report what scoring_metadata schema it uses.
+
+     Useful for debugging migration progress and answering questions like
+     "does this run file have the canonical schema yet?". Returns a dict
+     of per-field counts plus an overall schema status.
+
+     Returns:
+         {
+             "total_results": int,
+             "with_canonical_base_score": int,
+             "with_test_gate_passed": int,
+             "with_mode_scores": int,
+             "with_empty_metadata": int,
+             "schema_status": "canonical" | "partial" | "legacy" | "empty",
+         }
+     """
+     data = json.loads(Path(path).read_text())
+     results = data.get("results", [])
+     total = len(results)
+     has_base = sum(1 for r in results
+                    if isinstance((r.get("scoring_metadata") or {}).get("base_score"), (int, float)))
+     has_gate = sum(1 for r in results
+                    if isinstance((r.get("scoring_metadata") or {}).get("test_gate_passed"), bool))
+     has_modes = sum(1 for r in results
+                     if isinstance((r.get("scoring_metadata") or {}).get("mode_scores"), dict))
+     empty_meta = sum(1 for r in results if not (r.get("scoring_metadata") or {}))
+
+     if total == 0:
+         status = "empty"
+     elif has_modes == total:
+         status = "canonical"
+     elif has_base == total and has_gate == total:
+         status = "partial"  # canonical fields present but no precomputed mode_scores
+     else:
+         status = "legacy"
+
+     return {
+         "total_results": total,
+         "with_canonical_base_score": has_base,
+         "with_test_gate_passed": has_gate,
+         "with_mode_scores": has_modes,
+         "with_empty_metadata": empty_meta,
+         "schema_status": status,
+     }
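
A sketch of how a migration check might consume the summary (file path hypothetical):

    from athanor.calibrate import summarize_run_file

    summary = summarize_run_file("run.json")
    if summary["schema_status"] != "canonical":
        pct = summary["with_mode_scores"] / max(summary["total_results"], 1)
        print(f"mode_scores coverage: {pct:.0%}; regenerate via the env's scoring.py")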