athanor-sdk 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
athanor/__init__.py ADDED
@@ -0,0 +1,28 @@
+ """Athanor AI — Lean 4 proof verification as RL training signal."""
+
+ from athanor.env import Environment
+ from athanor.types import ScoreResult, TaskConfig
+ from athanor.lean import verify_proof, check_sorry, score_proof, ProofResult
+
+ __version__ = "0.3.2"
+
+
+ def make(env_name: str, task: str | None = None, **kwargs) -> Environment:
+     """Create an Athanor environment.
+
+     Args:
+         env_name: Environment image name (e.g. 'neuron-nki-kernels').
+         task: Optional task ID to start with.
+         **kwargs: Passed to Environment (image=, timeout=).
+
+     Returns:
+         Environment instance ready for reset()/score() calls.
+     """
+     return Environment(env_name, task=task, **kwargs)
+
+
+ __all__ = [
+     "make", "Environment", "ScoreResult", "TaskConfig",
+     "verify_proof", "check_sorry", "score_proof", "ProofResult",
+     "__version__",
+ ]
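
A minimal usage sketch for `make()` (not part of the package): the `reset()`/`score()` call shapes are assumed from the docstring, since `athanor/env.py` is not shown in this diff, and the task ID is hypothetical.

    import athanor

    env = athanor.make("neuron-nki-kernels", task="example-task-001", timeout=600)
    task = env.reset()                          # assumed API: returns the initial task payload
    result = env.score("<candidate solution>")  # assumed API: returns a ScoreResult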
athanor/calibrate.py ADDED
@@ -0,0 +1,419 @@
+ """Sigmoid calibrator — fit sigmoid parameters from score data.
+
+ The calibration maps raw scores (e.g. property test pass fractions) to
+ calibrated scores in [0, 1] using a sigmoid. This helps separate
+ "trying" from "succeeding" — the sigmoid center marks the threshold
+ where scores start counting.
+
+     score_calibrated = 1 / (1 + exp(-scale * (raw - center)))
+ """
+ from __future__ import annotations
+
+ import json
+ import math
+ from pathlib import Path
+
+
+ def sigmoid(x: float, center: float, scale: float) -> float:
+     """Standard logistic sigmoid centered at `center` with steepness `scale`."""
+     z = scale * (x - center)
+     # Clamp to avoid overflow
+     if z > 500:
+         return 1.0
+     if z < -500:
+         return 0.0
+     return 1.0 / (1.0 + math.exp(-z))
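
A quick numeric check of `sigmoid` against the module formula (values rounded; the inputs are illustrative):

    from athanor.calibrate import sigmoid

    sigmoid(0.7, center=0.7, scale=8.0)   # 0.5: raw score exactly at the center
    sigmoid(0.9, center=0.7, scale=8.0)   # ~0.832: above the threshold
    sigmoid(0.5, center=0.7, scale=8.0)   # ~0.168: below the threshold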
+
+
+ def fit_sigmoid(
+     raw_scores: list[float],
+     targets: list[float] | None = None,
+     *,
+     max_iter: int = 200,
+     lr: float = 0.1,
+ ) -> tuple[float, float]:
+     """Fit sigmoid center + scale via gradient descent on MSE loss.
+
+     If targets is None, fits to binarized targets (1 if raw > median, else 0).
+     This finds a natural decision boundary in the score distribution.
+
+     Args:
+         raw_scores: raw scores from property tests / simulations
+         targets: optional target calibrated scores (0..1). If None, uses
+             binarization at the median of raw_scores.
+         max_iter: gradient descent iterations
+         lr: learning rate
+
+     Returns:
+         (center, scale) tuple
+     """
+     if not raw_scores:
+         return 0.5, 8.0
+
+     if targets is None:
+         sorted_raw = sorted(raw_scores)
+         median = sorted_raw[len(sorted_raw) // 2]
+         targets = [1.0 if r > median else 0.0 for r in raw_scores]
+
+     if len(raw_scores) != len(targets):
+         raise ValueError("raw_scores and targets must have same length")
+
+     # Initialize at the mean of the raw scores
+     mean_raw = sum(raw_scores) / len(raw_scores)
+     center = mean_raw
+     scale = 8.0
+     n = len(raw_scores)
+
+     for _ in range(max_iter):
+         d_center = 0.0
+         d_scale = 0.0
+         for x, t in zip(raw_scores, targets):
+             p = sigmoid(x, center, scale)
+             err = p - t
+             # d/d_center sigmoid = -scale * p * (1-p)
+             # d/d_scale sigmoid = (x - center) * p * (1-p)
+             sigmoid_deriv = p * (1.0 - p)
+             d_center += err * (-scale * sigmoid_deriv)
+             d_scale += err * ((x - center) * sigmoid_deriv)
+         d_center /= n
+         d_scale /= n
+         center -= lr * d_center
+         scale -= lr * d_scale
+         # Keep scale positive and bounded; clamp center to the [0, 1] score range
+         if scale < 0.1:
+             scale = 0.1
+         if scale > 100.0:
+             scale = 100.0
+         if center < 0.0:
+             center = 0.0
+         if center > 1.0:
+             center = 1.0
+
+     return round(center, 4), round(scale, 4)
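
A self-contained sketch of the self-supervised path (targets=None); the raw scores are made up for illustration:

    from athanor.calibrate import fit_sigmoid, sigmoid

    # Bimodal raw scores: a failing cluster and a succeeding cluster.
    raw = [0.10, 0.15, 0.20, 0.25, 0.80, 0.85, 0.90, 0.95]
    center, scale = fit_sigmoid(raw)   # binarizes at the median (0.80; strictly above counts as 1)
    calibrated = [sigmoid(r, center, scale) for r in raw]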
+
+
+ def calibrate_run_file(
+     run_path: str | Path,
+     *,
+     metric: str = "base_completeness_score",
+ ) -> tuple[float, float]:
+     """Load a run JSON file and fit sigmoid from the raw scores.
+
+     Args:
+         run_path: path to a run JSON (format: {results: [{score, scoring_metadata: {...}}]})
+         metric: metadata key to use as the raw score. Defaults to
+             base_completeness_score, which is the pre-sigmoid score
+             in most environments.
+
+     Returns:
+         (center, scale) fitted to the metric values in the run.
+     """
+     path = Path(run_path)
+     data = json.loads(path.read_text())
+     results = data.get("results")
+     if results is None and "score" in data and "task" in data:
+         # A bare single-result file: wrap it so the loop below still works.
+         results = [data]
+     elif results is None:
+         results = []
+     raw = []
+     for r in results:
+         meta = r.get("scoring_metadata", {})
+         val = meta.get(metric)
+         if val is None:
+             val = r.get("score")
+         if isinstance(val, (int, float)):
+             raw.append(float(val))
+     if not raw:
+         raise ValueError(f"No valid scores found in {run_path} for metric {metric!r}")
+     return fit_sigmoid(raw)
+
+
+ def apply_sigmoid(raw_scores: list[float], center: float, scale: float) -> list[float]:
+     """Apply a fitted sigmoid to a list of raw scores."""
+     return [sigmoid(r, center, scale) for r in raw_scores]
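
An end-to-end sketch: the run file below is a hypothetical minimal instance of the documented shape, written to a temp directory.

    import json
    import tempfile
    from pathlib import Path
    from athanor.calibrate import apply_sigmoid, calibrate_run_file

    run = {"results": [
        {"score": 0.2, "scoring_metadata": {"base_completeness_score": 0.25}},
        {"score": 0.3, "scoring_metadata": {"base_completeness_score": 0.40}},
        {"score": 0.9, "scoring_metadata": {"base_completeness_score": 0.92}},
    ]}
    path = Path(tempfile.mkdtemp()) / "run.json"
    path.write_text(json.dumps(run))

    center, scale = calibrate_run_file(path)   # fits on the metric values
    print(apply_sigmoid([0.25, 0.40, 0.92], center, scale))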
+
+
+ # ---------------------------------------------------------------------------
+ # Mode-based recompute
+ # ---------------------------------------------------------------------------
+ #
+ # Athanor scoring containers emit a `scoring_metadata` object with
+ # `mode_scores = {"training": float, "eval": float}` when both modes
+ # apply. This module reads those precomputed values so you can flip the
+ # displayed score between modes without rerunning the eval — useful for
+ # regenerating heatmaps under a different reward regime.
+ #
+ # "training" mode is gradient-friendly (partial credit for progress on
+ # compile + property gates). "eval" mode is strict. The authoritative
+ # computation happens inside the scoring container; this module only
+ # reads the values.
+ #
+ # Backward compatibility: older run files without `mode_scores` fall
+ # back to the top-level `score` field. Safe on any run file.
+
+ # Cheat category labels that customers may see in scoring_metadata.cheat_category.
+ # This is a partial public-facing set — the full enforcement vocabulary
+ # lives in each env's scoring container. Used for dashboard labels + filtering.
+ KNOWN_CHEAT_CATEGORIES = (
+     "stub_detection",
+     "banned_construct",
+     "file_not_found",
+     "interface_violation",
+     "null_implementation",
+ )
+
+ VALID_MODES = ("training", "eval")
+ DEFAULT_MODE = "training"
+
+ # Where the per-customer "current mode for each env" state lives.
+ MODE_STATE_FILE = Path.home() / ".athanor" / "modes.json"
+
+
+ def _validate_mode(mode: str) -> None:
+     """Raise ValueError if mode is not one of the canonical modes."""
+     if mode not in VALID_MODES:
+         raise ValueError(f"mode must be one of {VALID_MODES}, got {mode!r}")
+
+
+ def recompute_score_for_mode(result: dict, mode: str) -> float:
+     """Recompute a single task result's score for the requested mode.
+
+     Resolution order (first hit wins):
+
+     1. **Precomputed mode_scores** — the authoritative source of truth.
+        Every post-rollout env emits this. The parser reads it directly
+        without recomputation.
+
+     2. **base_score + test_gate_passed** — APPROXIMATE fallback for
+        mid-migration envs that have the binary fields but haven't yet
+        emitted mode_scores. WARNING: base_score has env-specific
+        semantics (see schema docstring above) — for three-layer envs
+        (sb, neuro) and lean-bonus envs (NKI), this branch will return
+        a value that DIFFERS from what mode_scores.training would return.
+        The error is bounded (the difference is the layer-3 multiplier
+        and floors), but if precision matters and you care about cross-
+        env consistency, regenerate the run file via the env's scoring.py
+        so mode_scores gets populated.
+
+     3. **Legacy top-level `score`** — for old run files with no
+        canonical fields at all (pre-rollout, neuron-nki-kernels with
+        empty scoring_metadata). Returned as-is, no transformation.
+
+     4. **0.0** if nothing usable. Callers can spot it via the dropped
+        score and decide what to do.
+
+     Anti-cheat rules baked into branches 1 and 2:
+     - Training mode never gives credit when `test_gate_passed` is false.
+     - Eval mode never gives partial credit at all — the score is always
+       0.0 or 1.0 in branch 2, and whatever mode_scores.eval says in branch 1.
+     - A partial test pass rate is never extractable from this function;
+       `test_gate_passed` is the only test signal we trust.
+     - test_gate_passed=False with empty anti_cheat_violations is valid
+       (compile failures, no_progress stages) and still produces a hard zero.
+
+     Args:
+         result: a single task result dict (from run_file["results"][i]).
+         mode: "training" or "eval".
+
+     Returns:
+         The recomputed score as a float in [0.0, 1.0].
+
+     Raises:
+         ValueError: if `mode` is not in VALID_MODES.
+     """
+     _validate_mode(mode)
+     metadata = result.get("scoring_metadata") or {}
+
+     # 1. Precomputed mode_scores is the AUTHORITATIVE source of truth.
+     #    The env's scoring.py is responsible for getting the math right
+     #    and emits both modes at scoring time. All 10 post-rollout envs
+     #    emit this; the parser must prefer it over any recomputation.
+     mode_scores = metadata.get("mode_scores")
+     if isinstance(mode_scores, dict) and mode in mode_scores:
+         precomputed = mode_scores[mode]
+         if isinstance(precomputed, (int, float)):
+             return float(precomputed)
+
+     # 2. APPROXIMATE recompute from canonical individual fields. Only
+     #    used when mode_scores is missing — i.e. mid-migration envs.
+     #    For envs where base_score == mode_scores.training (hw-cbmc,
+     #    custom-tpu, congestion, dc, c-to-rust) this is exact. For
+     #    three-layer envs (sb, neuro) and lean-bonus envs (NKI) this
+     #    is APPROXIMATE because base_score is the layer-2 sigmoid alone
+     #    and the layer-3 multiplier/floors are not applied here. See
+     #    the docstring above. After the full 10-env rollout (2026-04-10)
+     #    this branch should never fire on a current run file.
+     base_score = metadata.get("base_score")
+     test_gate_passed = metadata.get("test_gate_passed")
+     if isinstance(base_score, (int, float)) and isinstance(test_gate_passed, bool):
+         base_score = float(base_score)
+         if mode == "training":
+             return base_score if test_gate_passed else 0.0
+         if mode == "eval":
+             return 1.0 if (base_score == 1.0 and test_gate_passed) else 0.0
+
+     # 3. Legacy fallback: no canonical fields. Use the top-level `score`
+     #    as-is. This is what pre-canonicalization run files end up with.
+     #    The anti-cheat agent will land the evaluate.py score=None
+     #    silent-drop fix soon, after which this branch is mostly dead —
+     #    but it stays as a safety net per their request.
+     legacy_score = result.get("score")
+     if isinstance(legacy_score, (int, float)):
+         return float(legacy_score)
+
+     # 4. Nothing usable — return 0.0 rather than crashing. Callers can
+     #    spot it via the dropped score and decide what to do.
+     return 0.0
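
The resolution order is easiest to see on hand-built results (all dicts below are hypothetical):

    from athanor.calibrate import recompute_score_for_mode

    # Branch 1: precomputed mode_scores wins, even when a legacy score exists.
    r1 = {"score": 0.4,
          "scoring_metadata": {"mode_scores": {"training": 0.62, "eval": 0.0}}}
    recompute_score_for_mode(r1, "training")   # 0.62
    recompute_score_for_mode(r1, "eval")       # 0.0

    # Branch 2: approximate fallback; training credit is gated on the tests.
    r2 = {"scoring_metadata": {"base_score": 0.7, "test_gate_passed": False}}
    recompute_score_for_mode(r2, "training")   # 0.0 (gate failed, no credit)

    # Branch 3: legacy run file with only a top-level score.
    recompute_score_for_mode({"score": 0.55}, "eval")   # 0.55, returned as-is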
+
+
+ def apply_mode_to_run_file(
+     path: str | Path,
+     mode: str,
+     *,
+     in_place: bool = False,
+     output_path: str | Path | None = None,
+ ) -> dict:
+     """Recompute every result's score in a run file for the given mode.
+
+     Reads the run JSON, walks `results[]`, replaces each result's `score`
+     field with the recomputed value for `mode`, and either returns the
+     new dict, writes back in place, or writes to `output_path`.
+
+     Original `scoring_metadata` is left untouched — only the top-level
+     `score` field on each result changes. This means you can flip modes
+     repeatedly without losing information.
+
+     A new top-level field `_athanor_mode` is added to the run file
+     indicating which mode the scores currently reflect, so finalize_readme
+     and other downstream tools can render the active mode in the UI.
+
+     Args:
+         path: path to the run JSON.
+         mode: "training" or "eval".
+         in_place: if True, overwrite `path` with the new content.
+         output_path: if provided, write the new content here instead.
+             Mutually exclusive with in_place.
+
+     Returns:
+         The recomputed dict (always, regardless of write mode).
+
+     Raises:
+         ValueError: if mode is invalid, or if both in_place and output_path
+             are set.
+     """
+     _validate_mode(mode)
+     if in_place and output_path is not None:
+         raise ValueError("in_place and output_path are mutually exclusive")
+
+     src = Path(path)
+     data = json.loads(src.read_text())
+
+     for result in data.get("results", []):
+         result["score"] = recompute_score_for_mode(result, mode)
+
+     data["_athanor_mode"] = mode
+
+     if in_place:
+         src.write_text(json.dumps(data, indent=2) + "\n")
+     elif output_path is not None:
+         Path(output_path).write_text(json.dumps(data, indent=2) + "\n")
+
+     return data
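
A usage sketch (file paths hypothetical): rewrite a run file's scores for eval mode without touching the original.

    from athanor.calibrate import apply_mode_to_run_file

    data = apply_mode_to_run_file("run.json", "eval", output_path="run.eval.json")
    data["_athanor_mode"]                    # "eval"
    [r["score"] for r in data["results"]]    # scores recomputed for eval mode
                                             # (legacy results pass through as-is)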
+
+
+ # ---------------------------------------------------------------------------
+ # Per-env mode persistence
+ # ---------------------------------------------------------------------------
+
+ def _read_mode_state() -> dict[str, str]:
+     """Read the user's mode state file. Returns {} if missing or corrupt."""
+     if not MODE_STATE_FILE.exists():
+         return {}
+     try:
+         return json.loads(MODE_STATE_FILE.read_text())
+     except (json.JSONDecodeError, OSError):
+         return {}
+
+
+ def _write_mode_state(state: dict[str, str]) -> None:
+     """Write the mode state file (mkdir parent if needed)."""
+     MODE_STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
+     MODE_STATE_FILE.write_text(json.dumps(state, indent=2, sort_keys=True) + "\n")
+
+
+ def get_env_mode(env_dir: str | Path) -> str:
+     """Return the active mode for an env, or 'training' if unset.
+
+     Mode is keyed by the resolved absolute path of `env_dir`, so the same
+     env at different mount points can have different modes if needed.
+     """
+     state = _read_mode_state()
+     key = str(Path(env_dir).resolve())
+     return state.get(key, DEFAULT_MODE)
+
+
+ def set_env_mode(env_dir: str | Path, mode: str) -> None:
+     """Persist the active mode for an env in ~/.athanor/modes.json.
+
+     Raises ValueError if `mode` is not in VALID_MODES.
+     """
+     _validate_mode(mode)
+     state = _read_mode_state()
+     state[str(Path(env_dir).resolve())] = mode
+     _write_mode_state(state)
+
+
+ def list_env_modes() -> dict[str, str]:
+     """Return a copy of the full mode state — all envs the user has touched."""
+     return dict(_read_mode_state())
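
A sketch of the persistence helpers (the env path is hypothetical; keys in the state file are resolved absolute paths):

    from athanor.calibrate import get_env_mode, set_env_mode, list_env_modes

    get_env_mode("envs/neuron-nki-kernels")           # "training" until explicitly set
    set_env_mode("envs/neuron-nki-kernels", "eval")   # persists to ~/.athanor/modes.json
    list_env_modes()                                  # {"/abs/path/to/envs/neuron-nki-kernels": "eval", ...}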
+
+
+ # ---------------------------------------------------------------------------
+ # Inspection helpers (used by `athanor calibrate show` once cli.py lands)
+ # ---------------------------------------------------------------------------
+
+ def summarize_run_file(path: str | Path) -> dict:
+     """Inspect a run file and report what scoring_metadata schema it uses.
+
+     Useful for debugging migration progress and answering questions like
+     "does this run file have the canonical schema yet?". Returns a dict
+     of per-field counts plus an overall schema status.
+
+     Returns:
+         {
+             "total_results": int,
+             "with_canonical_base_score": int,
+             "with_test_gate_passed": int,
+             "with_mode_scores": int,
+             "with_empty_metadata": int,
+             "schema_status": "canonical" | "partial" | "legacy" | "empty",
+         }
+     """
+     data = json.loads(Path(path).read_text())
+     results = data.get("results", [])
+     total = len(results)
+     has_base = sum(1 for r in results
+                    if isinstance((r.get("scoring_metadata") or {}).get("base_score"), (int, float)))
+     has_gate = sum(1 for r in results
+                    if isinstance((r.get("scoring_metadata") or {}).get("test_gate_passed"), bool))
+     has_modes = sum(1 for r in results
+                     if isinstance((r.get("scoring_metadata") or {}).get("mode_scores"), dict))
+     empty_meta = sum(1 for r in results if not (r.get("scoring_metadata") or {}))
+
+     if total == 0:
+         status = "empty"
+     elif has_modes == total:
+         status = "canonical"
+     elif has_base == total and has_gate == total:
+         status = "partial"  # canonical fields present but no precomputed mode_scores
+     else:
+         status = "legacy"
+
+     return {
+         "total_results": total,
+         "with_canonical_base_score": has_base,
+         "with_test_gate_passed": has_gate,
+         "with_mode_scores": has_modes,
+         "with_empty_metadata": empty_meta,
+         "schema_status": status,
+     }
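
A sketch of how a migration check might consume the summary (file path hypothetical):

    from athanor.calibrate import summarize_run_file

    summary = summarize_run_file("run.json")
    if summary["schema_status"] != "canonical":
        pct = summary["with_mode_scores"] / max(summary["total_results"], 1)
        print(f"mode_scores coverage: {pct:.0%}; regenerate via the env's scoring.py")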