npm - @nookplot/cli - Versions diffs - 0.7.5 → 0.7.6 - Mend

@nookplot/cli 0.7.5 → 0.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/dist/commands/register.js +1 -0
package/dist/commands/register.js.map +1 -1
package/dist/commands/skill.js +15 -0
package/dist/commands/skill.js.map +1 -1
package/dist/commands/submitPaperReproduction.d.ts +35 -0
package/dist/commands/submitPaperReproduction.js +267 -0
package/dist/commands/submitPaperReproduction.js.map +1 -0
package/dist/commands/verifyReproduction.d.ts +40 -0
package/dist/commands/verifyReproduction.js +457 -34
package/dist/commands/verifyReproduction.js.map +1 -1
package/dist/index.js +2 -38
package/dist/index.js.map +1 -1
package/dist/skillGenerator.d.ts +1 -1
package/dist/skillGenerator.js +1 -1
package/dist/tool-manifest.json +39 -0
package/dist/utils/agentLoop.js +0 -45
package/dist/utils/agentLoop.js.map +1 -1
package/dist/utils/dashboard.js +11 -2
package/dist/utils/dashboard.js.map +1 -1
package/package.json +4 -3
package/dist/commands/forge.d.ts +0 -15
package/dist/commands/forge.js +0 -187
package/dist/commands/forge.js.map +0 -1
package/dist/commands/profile.d.ts +0 -33
package/dist/commands/profile.js +0 -472
package/dist/commands/profile.js.map +0 -1
package/dist/commands/swarms.d.ts +0 -14
package/dist/commands/swarms.js +0 -203
package/dist/commands/swarms.js.map +0 -1

package/dist/commands/verifyReproduction.js CHANGED Viewed

@@ -24,7 +24,7 @@
  * @module commands/verifyReproduction
  */
 import path from "node:path";
-import { promises as fs } from "node:fs";
+import { promises as fs, statfsSync } from "node:fs";
 import os from "node:os";
 import crypto from "node:crypto";
 import { spawnSync } from "node:child_process";
@@ -35,6 +35,89 @@ import inquirer from "inquirer";
 import { loadConfig, validateConfig } from "../config.js";
 import { gatewayRequest, isGatewayError } from "../utils/http.js";
 import evalManifest from "../evalManifest.json" with { type: "json" };
+// Pending-verification persistence: after a successful sandbox run we save
+// the attestation + scores to disk before POSTing to the gateway. If the POST
+// fails (network blip, wallet disconnect, 5xx), the next invocation detects
+// the saved file and offers to resume without re-running Docker.
+/**
+ * Directory that holds verification payloads which were built locally but
+ * not yet accepted by the gateway.
+ *
+ * @returns {string} `<home>/.nookplot/pending-verifications`
+ */
+export function pendingVerificationDir() {
+    const nookplotHome = path.join(os.homedir(), ".nookplot");
+    return path.join(nookplotHome, "pending-verifications");
+}
+/**
+ * Location of the pending-verification file for one submission.
+ *
+ * @param {string} submissionId - Submission identifier as given on the CLI.
+ * @returns {string} Absolute path of `<pending dir>/<sanitised id>.json`.
+ */
+export function pendingVerificationPath(submissionId) {
+    // Every character outside [a-zA-Z0-9_.-] becomes "_" so the id can never
+    // introduce a path separator and stays a single path component.
+    const fileStem = submissionId.replace(/[^a-zA-Z0-9_.-]/g, "_");
+    const fileName = `${fileStem}.json`;
+    return path.join(pendingVerificationDir(), fileName);
+}
+/**
+ * Persist a pending verification so a failed gateway POST can be retried
+ * without re-running the sandbox.
+ *
+ * The payload is written to a sibling temp file and then renamed into place,
+ * so an interrupted write (crash, Ctrl-C, full disk) can never leave a
+ * truncated JSON file at the final path.
+ *
+ * @param {object} p - Payload to save; `p.submissionId` selects the file name.
+ * @returns {Promise<void>}
+ * @throws Propagates any filesystem error from mkdir / writeFile / rename.
+ */
+export async function savePendingVerification(p) {
+    const finalPath = pendingVerificationPath(p.submissionId);
+    // Per-process suffix so two concurrent CLI runs don't share a temp file.
+    const tmpPath = `${finalPath}.${process.pid}.tmp`;
+    await fs.mkdir(pendingVerificationDir(), { recursive: true });
+    try {
+        await fs.writeFile(tmpPath, JSON.stringify(p, null, 2), "utf8");
+        await fs.rename(tmpPath, finalPath);
+    }
+    catch (err) {
+        // Best-effort cleanup of the partial temp file; the original error is
+        // the one the caller needs to see, so cleanup failures are ignored.
+        await fs.rm(tmpPath, { force: true }).catch(() => { });
+        throw err;
+    }
+}
+/**
+ * Read back a previously saved pending verification.
+ *
+ * @param {string} submissionId - Submission whose saved payload is wanted.
+ * @returns {Promise<object|null>} Parsed payload, or `null` when no file exists.
+ * @throws Any read error other than ENOENT, and JSON parse errors, so the
+ *         caller can decide whether to proceed.
+ */
+export async function loadPendingVerification(submissionId) {
+    const file = pendingVerificationPath(submissionId);
+    let text;
+    try {
+        text = await fs.readFile(file, "utf8");
+    }
+    catch (err) {
+        // A missing file simply means there is nothing to resume.
+        if (err.code === "ENOENT") {
+            return null;
+        }
+        throw err;
+    }
+    // Corrupt JSON surfaces as a SyntaxError to the caller.
+    return JSON.parse(text);
+}
+/**
+ * Delete the saved pending verification for a submission, if there is one.
+ *
+ * @param {string} submissionId - Submission whose saved payload is removed.
+ * @returns {Promise<void>}
+ * @throws Any unlink error other than ENOENT.
+ */
+export async function clearPendingVerification(submissionId) {
+    const file = pendingVerificationPath(submissionId);
+    await fs.unlink(file).catch((err) => {
+        // Already gone — nothing to clear.
+        if (err.code === "ENOENT") {
+            return;
+        }
+        throw err;
+    });
+}
+// Translate gateway error codes/messages into user-facing guidance. The gateway
+// returns structured error strings (see verifierGateHelpers.ts) but they're
+// terse. This helper wraps them so the verifier sees what to check.
+/**
+ * Map a raw gateway error to zero or more user-facing hints.
+ *
+ * Matching is case-insensitive substring search against known error codes.
+ * Several hints may be returned when the error mentions several codes; the
+ * order follows the rule table below.
+ *
+ * @param {unknown} raw - Gateway error. Normally a string; an object with a
+ *        string `message` (e.g. an Error) uses that message; `null` /
+ *        `undefined` yield no hints; anything else is stringified.
+ * @returns {string[]} Hints to print under the raw error (possibly empty).
+ */
+export function describeVerificationError(raw) {
+    // This runs inside the error path, so it must never throw on an
+    // unexpected payload shape — that would hide the real gateway error.
+    const text = typeof raw === "string"
+        ? raw
+        : typeof raw?.message === "string"
+            ? raw.message
+            : raw == null ? "" : String(raw);
+    const msg = text.toLowerCase();
+    const has = (needle) => msg.includes(needle);
+    // Ordered [predicate, hint] pairs; every matching rule contributes a hint.
+    const rules = [
+        [() => has("attestation_required"),
+            "Paper_reproduction verifications must include a sandboxAttestation. " +
+                "Re-run without --skip-sandbox, or install Docker if it's missing."],
+        [() => has("eval_bundle_sha256_required"),
+            "The challenge's reference_impl_sha256 allow-list is set but your attestation " +
+                "didn't include evalBundleSha256. Update your CLI: `npm i -g @nookplot/cli@latest`."],
+        [() => has("eval_bundle_sha256_mismatch"),
+            "The eval bundle you ran doesn't match the challenge's pinned reference. " +
+                "Either the IPFS gateway served swapped content (try --ipfs-gateway <url>) or the challenge " +
+                "was re-seeded after this CLI build — upgrade with `npm i -g @nookplot/cli@latest`."],
+        [() => has("comprehension") && has("not_passed"),
+            "You haven't passed the comprehension gate for this submission. " +
+                "Use the MCP tools nookplot_request_comprehension_challenge + nookplot_submit_comprehension_answers first."],
+        [() => has("poster_verification") || has("self_verification"),
+            "You can't verify your own submission or one on a challenge you posted."],
+        [() => has("same_challenge_competitor"),
+            "You have an open submission on this same challenge — you can't grade competitors. " +
+                "Wait until your own submission resolves, then verify other challenges."],
+        [() => has("paper_reproduction_activity_required"),
+            "Your account needs activity before verifying paper_reproduction: stake NOOK, " +
+                "submit a reasoning trace somewhere, or verify a different challenge first."],
+        [() => has("invalid_attestation"),
+            "The gateway rejected the attestation shape. If this persists, file an issue with the raw error " +
+                "— your CLI may be out of sync with the gateway schema."],
+        [() => has("duplicate_verification"),
+            "You've already verified this submission once. Each verifier gets one slot."],
+        [() => has("quorum_full") || has("verification_closed"),
+            "Quorum is full or verification has closed on this submission — nothing left to do."],
+    ];
+    return rules.filter(([applies]) => applies()).map(([, hint]) => hint);
+}
 // Look up the sha256 we pinned for a given eval-bundle CID. Returns null for
 // CIDs we don't know about (e.g. CID rotated after this CLI build shipped,
 // or a non-standard reference bundle) — caller should log but proceed.
@@ -113,9 +196,29 @@ export function verifyEvalBundleContent(buf, cid) {
 // Currently hosted under the project creator's personal namespace; will migrate
 // to ghcr.io/nookprotocol/paper-reproduction-verifier:v1 once the org-level
 // registry is set up. Env vars override for dev / local testing.
+//
+// This is ONLY a fallback for offline / self-hosted-gateway runs. The
+// authoritative source is GET /v1/mining/verifier-image-digests (see
+// fetchTrustedVerifierImageDigests) — when the gateway is reachable, the
+// fetched allow-list overrides this default so rotated digests reach older
+// CLI builds without requiring a release.
 const DEFAULT_VERIFIER_IMAGE_DIGEST = process.env.NOOKPLOT_VERIFIER_IMAGE_DIGEST ??
-    "sha256:f239b7e91610bcea92668bafaac9eddf5ec240342d8c9e18ae4139d941c4e317";
+    "sha256:64d7d11917067ac3e98937a6f791145e4674a4ac4348071521f98b9b9d123fcb";
 const DEFAULT_VERIFIER_IMAGE = process.env.NOOKPLOT_VERIFIER_IMAGE ?? "ghcr.io/basedmd/paper-reproduction-verifier:v1";
+// Fetch the gateway's current trusted verifier image digest allow-list.
+// Pure helper — exported for unit tests. Returns `null` on any failure so
+// the caller can fall back to the hardcoded default without crashing the
+// verify flow on transient network blips.
+/**
+ * Ask the gateway for its trusted verifier-image digest allow-list.
+ *
+ * @param {string} gatewayUrl - Base URL of the gateway.
+ * @param {string} apiKey - API key forwarded to `gatewayRequest`.
+ * @returns {Promise<{digests: string[], configured: boolean} | null>}
+ *          Lower-cased digests plus the gateway's `configured` flag, or
+ *          `null` on any failure (thrown request, gateway error, or a
+ *          response without a `data` object) so the caller can fall back.
+ */
+export async function fetchTrustedVerifierImageDigests(gatewayUrl, apiKey) {
+    let res;
+    try {
+        res = await gatewayRequest(gatewayUrl, "GET", "/v1/mining/verifier-image-digests", { apiKey });
+    }
+    catch {
+        // Honour the "null on any failure" contract even if the request
+        // helper throws instead of returning an error result.
+        return null;
+    }
+    if (isGatewayError(res)) {
+        return null;
+    }
+    const data = res?.data;
+    if (data === null || typeof data !== "object") {
+        // Malformed success payload — treat like an unreachable gateway.
+        return null;
+    }
+    // Normalise digests to lowercase so comparisons match the CLI's --image-digest.
+    const digests = Array.isArray(data.digests)
+        ? data.digests.filter((d) => typeof d === "string").map((d) => d.toLowerCase())
+        : [];
+    return { digests, configured: Boolean(data.configured) };
+}
 export function registerVerifyReproductionCommand(program) {
     program
         .command("verify-reproduction <submissionId>")
@@ -173,45 +276,141 @@ async function runVerifyReproduction(globalOpts, submissionId, cmdOpts) {
         console.error(chalk.red("  ✗ Submission has no artifact_cid — not a paper_reproduction submission?"));
         process.exit(1);
     }
+    // Quorum preflight: bail out early (before spending 5-30 min on Docker) if
+    // the quorum is already saturated. The server-side VERIFICATION_SATURATED
+    // gate will reject anyway — catching it here saves the verifier's compute.
+    // NOTE(review): this check runs before the pending-attestation lookup
+    // below, so it is NOT skipped when resuming a saved attestation — a
+    // saturated quorum returns here even if a saved payload exists.
+    if (submission.verificationStatus?.quorumCapReached) {
+        const { verificationCount, verificationQuorum } = submission.verificationStatus;
+        console.log(chalk.yellow(`  ⚠  This submission already has ${verificationCount} verifications ` +
+            `(quorum ${verificationQuorum} + 2 cap). ` +
+            "Server would reject with VERIFICATION_SATURATED."));
+        console.log(chalk.dim("  Nothing to do — pick a different submission via the mining feed " +
+            "or `nookplot_discover_verifications`."));
+        return;
+    }
     const target = parseFloat(paperConfig.target_metric_value);
     const epsSandbox = parseFloat(paperConfig.epsilon_sandbox);
     console.log(chalk.dim(`  target ${paperConfig.target_metric_name} = ${target} (ε_sandbox = ${epsSandbox})`));
     console.log(chalk.dim(`  expected ${paperConfig.expected_eval_minutes}min CPU`));
-    // 3. Docker sandbox run (unless --skip-sandbox)
+    if (submission.verificationStatus) {
+        const { verificationCount, verificationQuorum } = submission.verificationStatus;
+        console.log(chalk.dim(`  quorum status: ${verificationCount} / ${verificationQuorum} verifications filed (cap at ${verificationQuorum + 2})`));
+    }
+    // Resume-from-pending: if a prior run produced an attestation but the POST
+    // failed, offer to skip the sandbox and reuse the saved payload. Otherwise
+    // the verifier redoes 3-20 minutes of compute just to retry a network call.
+    const pending = await loadPendingVerification(submissionId).catch(() => null);
     let attestation = null;
-    if (cmdOpts.skipSandbox) {
-        console.log(chalk.yellow("  ⚠ --skip-sandbox: will submit without an attestation; gateway will reject."));
+    let answers = null;
+    if (pending) {
+        console.log(chalk.cyan(`  ℹ  Found a saved attestation from ${new Date(pending.savedAt).toLocaleString()} — ` +
+            "a previous run built the attestation but the gateway POST didn't succeed."));
+        const { resume } = await inquirer.prompt([
+            { type: "confirm", name: "resume", message: "Resume with the saved attestation and scores?", default: true },
+        ]);
+        if (resume) {
+            attestation = pending.attestation;
+            answers = pending.answers;
+        }
+        else {
+            await clearPendingVerification(submissionId);
+        }
     }
-    else {
-        attestation = await runSandboxAndBuildAttestation({
-            submission,
-            paperConfig,
-            ipfsGateway: cmdOpts.ipfsGateway,
-            image: cmdOpts.image,
-            imageDigest: cmdOpts.imageDigest,
-            cpus: Number(cmdOpts.cpus),
-            memory: cmdOpts.memory,
-            apiKey: config.apiKey,
-            gatewayUrl: config.gateway,
+    // 3. Docker sandbox run (unless --skip-sandbox or resuming)
+    if (!attestation) {
+        if (cmdOpts.skipSandbox) {
+            console.log(chalk.yellow("  ⚠ --skip-sandbox: will submit without an attestation; gateway will reject."));
+        }
+        else {
+            attestation = await runSandboxAndBuildAttestation({
+                submission,
+                paperConfig,
+                ipfsGateway: cmdOpts.ipfsGateway,
+                image: cmdOpts.image,
+                imageDigest: cmdOpts.imageDigest,
+                cpus: Number(cmdOpts.cpus),
+                memory: cmdOpts.memory,
+                apiKey: config.apiKey,
+                gatewayUrl: config.gateway,
+            });
+        }
+    }
+    // Divergence preview: after a fresh sandbox run, show claimed vs attested
+    // side-by-side so the verifier sees whether their attestation will survive
+    // the gateway's CLAIMED_METRIC_MISMATCH gate (|attested − claimed| > 2×ε).
+    // Without this, a verifier spends 5-30 min on the sandbox, then 1-2 min
+    // entering scores, only to have the POST rejected at the last step.
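+    // Skipped whenever a saved attestation was found on disk — the previous
+    // run already surfaced this. NOTE(review): the `!pending` guard also skips
+    // the preview when the user declined to resume and the sandbox re-ran.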
+    if (attestation && !pending && submission.claimedMetricValue != null) {
+        const claimed = parseFloat(submission.claimedMetricValue);
+        if (Number.isFinite(claimed)) {
+            const divergence = Math.abs(attestation.metricValue - claimed);
+            const gateLimit = epsSandbox * 2;
+            console.log("");
+            console.log(chalk.bold("  Claimed vs attested:"));
+            console.log(`    claimed  = ${claimed.toFixed(4)}  (solver)`);
+            console.log(`    attested = ${attestation.metricValue.toFixed(4)}  (your sandbox)`);
+            console.log(chalk.dim(`    |Δ|      = ${divergence.toFixed(4)}  (gateway rejects if > ${gateLimit.toFixed(4)} = 2×ε_sandbox)`));
+            if (divergence > gateLimit) {
+                console.log(chalk.red(`  ✗  Divergence ${divergence.toFixed(4)} exceeds 2×ε_sandbox (${gateLimit.toFixed(4)}). ` +
+                    "Gateway will reject with CLAIMED_METRIC_MISMATCH."));
+                const { proceed } = await inquirer.prompt([
+                    {
+                        type: "confirm",
+                        name: "proceed",
+                        message: "Submit anyway (for the record)? Gateway will reject.",
+                        default: false,
+                    },
+                ]);
+                if (!proceed) {
+                    console.log(chalk.dim("  Aborted. No scores submitted, no NOOK consumed. Sandbox attestation kept on disk."));
+                    await clearPendingVerification(submissionId);
+                    return;
+                }
+            }
+            else if (divergence > epsSandbox) {
+                // Within the hard gate but outside the per-run jitter floor — worth
+                // flagging so the verifier double-checks their sandbox config.
+                console.log(chalk.yellow(`  ⚠  Divergence ${divergence.toFixed(4)} is inside the hard gate but above ε_sandbox (${epsSandbox.toFixed(4)}). ` +
+                    "Expected for honest variance; investigate if it looks systematic."));
+            }
+            else {
+                console.log(chalk.green(`  ✓  Within tolerance.`));
+            }
+            console.log("");
+        }
+    }
+    // 4. Prompt verifier for 4D scores + insight (skipped if resuming)
+    if (!answers) {
+        answers = await inquirer.prompt([
+            { type: "input", name: "correctnessScore", message: "Correctness score (0-1)", default: "0.9",
+                validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
+                filter: (s) => Number(s) },
+            { type: "input", name: "reasoningScore", message: "Reasoning score (0-1)", default: "0.8",
+                validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
+                filter: (s) => Number(s) },
+            { type: "input", name: "efficiencyScore", message: "Efficiency score (0-1)", default: "0.7",
+                validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
+                filter: (s) => Number(s) },
+            { type: "input", name: "noveltyScore", message: "Novelty score (0-1)", default: "0.5",
+                validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
+                filter: (s) => Number(s) },
+            { type: "input", name: "knowledgeInsight", message: "Knowledge insight (≥80 chars)",
+                validate: (s) => s.length >= 80 ? true : "need ≥80 characters" },
+        ]);
+    }
+    // Save before POST so a crash mid-request doesn't lose the sandbox output.
+    if (attestation) {
+        await savePendingVerification({
+            submissionId,
+            challengeId: submission.challengeId,
+            attestation,
+            answers,
+            savedAt: new Date().toISOString(),
         });
     }
-    // 4. Prompt verifier for 4D scores + insight
-    const answers = await inquirer.prompt([
-        { type: "input", name: "correctnessScore", message: "Correctness score (0-1)", default: "0.9",
-            validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
-            filter: (s) => Number(s) },
-        { type: "input", name: "reasoningScore", message: "Reasoning score (0-1)", default: "0.8",
-            validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
-            filter: (s) => Number(s) },
-        { type: "input", name: "efficiencyScore", message: "Efficiency score (0-1)", default: "0.7",
-            validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
-            filter: (s) => Number(s) },
-        { type: "input", name: "noveltyScore", message: "Novelty score (0-1)", default: "0.5",
-            validate: (s) => !Number.isFinite(+s) ? "number required" : (+s >= 0 && +s <= 1 ? true : "must be 0..1"),
-            filter: (s) => Number(s) },
-        { type: "input", name: "knowledgeInsight", message: "Knowledge insight (≥80 chars)",
-            validate: (s) => s.length >= 80 ? true : "need ≥80 characters" },
-    ]);
     // 5. POST verify with attestation
     const verifyBody = {
         ...answers,
@@ -224,19 +423,95 @@ async function runVerifyReproduction(globalOpts, submissionId, cmdOpts) {
     if (isGatewayError(postRes)) {
         postSpinner.fail("Verification rejected");
         console.error(chalk.red(`  ${postRes.error}`));
+        for (const hint of describeVerificationError(postRes.error)) {
+            console.error(chalk.dim(`  → ${hint}`));
+        }
+        if (attestation) {
+            console.error(chalk.dim(`  Your attestation is saved at ${pendingVerificationPath(submissionId)}. ` +
+                "Re-run `nookplot verify-reproduction " + submissionId + "` to resume."));
+        }
         process.exit(1);
     }
+    await clearPendingVerification(submissionId);
     postSpinner.succeed(chalk.green("Verification accepted."));
     if (cmdOpts.json) {
         console.log(JSON.stringify({ submissionId, attestation, result: postRes.data }, null, 2));
     }
 }
+/**
+ * Check the host before a sandbox run: Docker reachable, Rosetta present on
+ * arm64 macOS (warning only), and at least 2 GiB free in the temp directory.
+ *
+ * @returns {{rosettaAvailable: boolean|null}} `null` when the host is not an
+ *          arm64 Mac; otherwise whether `arch -x86_64 true` exited with 0.
+ * @throws {Error} When Docker is missing or unreachable, or when free space
+ *         in `os.tmpdir()` is below 2 GiB.
+ */
+export function preflightSandboxEnvironment() {
+    // 1. Docker must be installed and its daemon must answer.
+    const docker = spawnSync("docker", ["version", "--format", "{{.Server.Version}}"], {
+        encoding: "utf8",
+    });
+    const dockerOk = !docker.error && docker.status === 0;
+    if (!dockerOk) {
+        if (docker.error?.code === "ENOENT") {
+            throw new Error("Docker is not installed or not on PATH. Install Docker Desktop, or on macOS run `brew install colima docker docker-buildx && colima start --cpu 4 --memory 8`.");
+        }
+        const detail = docker.stderr || docker.error?.message || "unknown";
+        throw new Error("Docker is installed but the daemon is not reachable. Start Docker Desktop, or run `colima start` on macOS. Raw error: " +
+            detail);
+    }
+    // 2. Rosetta probe — only meaningful on Apple Silicon.
+    const isAppleSilicon = process.platform === "darwin" && os.arch() === "arm64";
+    let rosettaAvailable = null;
+    if (isAppleSilicon) {
+        const probe = spawnSync("arch", ["-x86_64", "true"], { encoding: "utf8" });
+        rosettaAvailable = probe.status === 0;
+        if (!rosettaAvailable) {
+            console.warn(chalk.yellow("  ⚠  arm64 Mac without Rosetta 2 — the reference image is linux/amd64 and will " +
+                "run under qemu emulation (3-10× slower). Install Rosetta with " +
+                "`softwareupdate --install-rosetta` for a faster verify. If the sandbox times out, " +
+                "this is the likely cause."));
+        }
+    }
+    // 3. Free-space check. Measuring is best-effort: if statfsSync throws,
+    //    the check is skipped silently rather than blocking the run.
+    const MIN_FREE_BYTES = 2 * 1024 * 1024 * 1024;
+    let freeBytes = null;
+    try {
+        const stat = statfsSync(os.tmpdir());
+        freeBytes = Number(stat.bavail) * Number(stat.bsize);
+    }
+    catch {
+        freeBytes = null;
+    }
+    if (freeBytes !== null && freeBytes < MIN_FREE_BYTES) {
+        throw new Error(`Less than 2 GiB free in ${os.tmpdir()} (available ${(freeBytes / 1024 / 1024).toFixed(0)} MiB). ` +
+            "The largest V1 eval bundle is ~190 MiB and extraction plus the artifact needs headroom. " +
+            "Free space or set TMPDIR to a larger volume and retry.");
+    }
+    return { rosettaAvailable };
+}
 async function runSandboxAndBuildAttestation(args) {
     const { submission, paperConfig, ipfsGateway, image, imageDigest, cpus, memory } = args;
     if (!imageDigest || !/^sha256:[0-9a-f]{64}$/.test(imageDigest)) {
         throw new Error("Missing --image-digest. Pass the pinned sha256 digest for the reference verifier image " +
             "(env NOOKPLOT_VERIFIER_IMAGE_DIGEST, or --image-digest sha256:<64 hex>).");
     }
+    // Fetch the gateway's current allow-list and verify our local digest is on
+    // it. When the rotation pushes a new digest, this catches the mismatch
+    // BEFORE the 5-30 min sandbox run — otherwise the gateway's
+    // UNTRUSTED_VERIFIER_IMAGE gate would reject the attestation at POST time.
+    // On gateway unreachable (null return), fall through with a warning — the
+    // server-side gate remains the source of truth and will reject at post
+    // time if the digest is actually untrusted.
+    const trustList = await fetchTrustedVerifierImageDigests(args.gatewayUrl, args.apiKey);
+    if (trustList) {
+        if (!trustList.configured) {
+            console.warn(chalk.yellow("  ⚠  Gateway has no trusted verifier image digests configured — the operator must set " +
+                "NOOKPLOT_VERIFIER_IMAGE_DIGESTS before your verification can be accepted. " +
+                "Running anyway (gateway will reject with VERIFIER_IMAGE_DIGEST_UNCONFIGURED)."));
+        }
+        else if (!trustList.digests.includes(imageDigest.toLowerCase())) {
+            throw new Error(`Image digest ${imageDigest} is NOT on the gateway's current trusted allow-list ` +
+                `(${trustList.digests.length} digest${trustList.digests.length === 1 ? "" : "s"} configured). ` +
+                "The reference image has likely rotated — upgrade your CLI with " +
+                "`npm i -g @nookplot/cli@latest`, or pass --image-digest with one of: " +
+                trustList.digests.join(", ") + ". " +
+                "Skipping this check would waste 5-30 min on a sandbox run that the gateway " +
+                "will reject with UNTRUSTED_VERIFIER_IMAGE.");
+        }
+    }
+    else {
+        console.warn(chalk.yellow("  ⚠  Could not fetch trusted verifier image digest allow-list from gateway. " +
+            "Proceeding with your local digest — gateway will still validate at POST time."));
+    }
+    const preflight = preflightSandboxEnvironment();
     const work = await fs.mkdtemp(path.join(os.tmpdir(), "nookplot-verify-"));
     const artifactDir = path.join(work, "artifact");
     const evalDir = path.join(work, "eval");
@@ -256,7 +531,7 @@ async function runSandboxAndBuildAttestation(args) {
     }
     const fetchSpinner = ora("Fetching artifact + eval bundle from IPFS…").start();
     await downloadBundleCid(ipfsGateway, submission.artifactCid, artifactDir, "artifact");
-    await downloadBundleCid(ipfsGateway, paperConfig.reference_implementation_cid, evalDir, "eval");
+    const { sha256: evalBundleSha256 } = await downloadBundleCid(ipfsGateway, paperConfig.reference_implementation_cid, evalDir, "eval");
     fetchSpinner.succeed("Artifact + eval bundle pulled.");
     // Run the reference image against the mounted artifact + eval.
     const runSpinner = ora("Running reference sandbox (this may take several minutes)…").start();
@@ -301,6 +576,20 @@ async function runSandboxAndBuildAttestation(args) {
         runSpinner.fail("Docker invocation failed");
         throw child.error;
     }
+    // spawnSync's `timeout` option fires SIGTERM and leaves status=null + signal set.
+    // Detect this explicitly so we can surface Rosetta guidance on arm64 Macs —
+    // qemu emulation is 3-10× slower than Rosetta and is the #1 cause of timeouts.
+    const timedOut = child.status === null && child.signal !== null;
+    if (timedOut) {
+        runSpinner.fail(`Sandbox exceeded ${budgetSeconds}s budget (killed ${child.signal}).`);
+        const rosettaHint = preflight.rosettaAvailable === false
+            ? " This machine is arm64 Mac without Rosetta 2 — the reference image runs under qemu (3-10× slower). " +
+                "Install Rosetta with `softwareupdate --install-rosetta` and retry."
+            : " Consider increasing --cpus or running on a faster host. " +
+                "The expected budget is expected_eval_minutes × 1.5; if the paper genuinely needs more, " +
+                "the challenge operator should adjust expected_eval_minutes.";
+        throw new Error(`Docker sandbox hit the ${budgetSeconds}s timeout.${rosettaHint}`);
+    }
     const rawStdout = (child.stdout ?? "") + (child.stderr ?? "");
     const exitCode = child.status ?? 1;
     runSpinner.succeed(`Sandbox completed in ${wallTimeS}s (exit ${exitCode}).`);
@@ -351,6 +640,7 @@ async function runSandboxAndBuildAttestation(args) {
         imageDigest,
         wallTimeS,
         exitCode,
+        evalBundleSha256,
     };
 }
 // AUDIT D3: cap bundle size at 1 GiB. STL10 (largest V1 shortlist bundle) is
@@ -405,12 +695,22 @@ async function downloadBundleCid(ipfsGateway, cid, destDir, kind) {
     // warning (fail-open); a CID that IS in the manifest MUST match its
     // sha256 — a gateway serving swapped content is blocked before the
     // sandbox sees it. See `verifyEvalBundleContent`.
+    //
+    // Regardless of kind, we always compute the sha256 of the downloaded
+    // bytes: the eval sha256 is surfaced up through the sandbox attestation
+    // so the gateway can cross-check it against the challenge's pinned
+    // reference_impl_sha256 allow-list (AUDIT §6b).
+    let bundleSha256;
     if (kind === "eval") {
         const verdict = verifyEvalBundleContent(buf, cid);
         if (!verdict.ok)
             throw new Error(verdict.error);
         if (verdict.warn)
             console.warn(chalk.yellow(`  ⚠  ${verdict.warn}`));
+        bundleSha256 = verdict.actualSha256;
+    }
+    else {
+        bundleSha256 = crypto.createHash("sha256").update(buf).digest("hex");
     }
     const isGzip = buf.length >= 2 && buf[0] === 0x1f && buf[1] === 0x8b;
     if (isGzip) {
@@ -421,12 +721,135 @@ async function downloadBundleCid(ipfsGateway, cid, destDir, kind) {
             throw new Error(`tar -xzf failed for ${kind} CID ${cid} (exit ${extract.status}): ${extract.stderr ?? ""}`);
         }
         await fs.unlink(tarballPath);
+        // AUDIT L2 (2026-04-20): walk the extracted tree and reject any
+        // symlink whose realpath resolves outside destDir. Modern GNU tar
+        // blocks `../` and absolute paths by default, but symlink-based
+        // escapes (e.g. `evil -> /proc/self/environ` or `evil -> /etc/passwd`)
+        // extract cleanly — the tar stream just writes a symlink entry, and
+        // any subsequent read through that path follows the link. The
+        // container runs with --read-only + --cap-drop ALL which makes the
+        // blast radius small, but this is the classic class of bug and the
+        // walk is cheap insurance that works cross-platform without
+        // depending on specific GNU-vs-BSD tar flags.
+        await assertNoSymlinkEscapes(destDir, kind, cid);
+        // Catch truncated/corrupt tarballs that extract cleanly to zero files —
+        // otherwise the sandbox fails later with opaque "mount dir is empty".
+        const entries = await fs.readdir(destDir);
+        if (entries.length === 0) {
+            throw new Error(`${kind} bundle at CID ${cid} extracted to zero files. The archive is empty or truncated. ` +
+                "Re-pin the bundle (if you operate the challenge) or try a different --ipfs-gateway.");
+        }
+        // Post-extraction entry-point check: the reference image's run.py expects
+        // /eval/eval.py and /artifact/inference.py. A bundle missing the entry
+        // point (malicious strip, bad tar layout with everything nested one level
+        // deep, or unknown-CID fail-open where the sha256 wasn't verified) would
+        // otherwise waste the 5-30 min sandbox run before failing with an opaque
+        // Docker error. Fail-fast here with a specific diagnostic.
+        const entrypoint = kind === "eval" ? "eval.py" : "inference.py";
+        await assertEntrypointPresent(destDir, entrypoint, kind, cid);
     }
     else {
         // Non-gzip CID — write as a single `bundle` file at the mount root and
         // let the image handle it (legacy path for agents pinning raw artifacts).
         await fs.writeFile(path.join(destDir, "bundle"), buf);
     }
+    return { sha256: bundleSha256 };
+}
+// AUDIT L2: walk the extracted tree rooted at destDir and reject any
+// symlink that points outside destDir. Cross-platform (no tar-flag
+// assumptions). Non-symlink directory entries are recursed into; files
+// are ignored (reading them is safe — attacker-controlled file content
+// doesn't escape). Exported for unit tests so we can assert rejection
+// on crafted destDir trees without running tar end-to-end.
+//
+// Behavior:
+//   - Symlink resolves inside destDir (same prefix after realpath) → OK.
+//   - Symlink resolves outside → throw, with a diagnostic that names the
+//     offending entry and its target. The whole extraction is aborted;
+//     the caller treats it as bundle malformation.
+//   - Dangling symlink (realpath fails) → checked lexically instead: the
+//     link text is resolved against the link's own directory and must
+//     still land inside destDir. A target that is missing on this host
+//     may exist where the bundle is later mounted (e.g. a `/proc/...`
+//     path dangles on macOS), so "missing here" is not proof of safety.
+//     A dangling link whose lexical target stays inside destDir is
+//     tolerated, as before.
+/**
+ * @param {string} destDir - Extraction directory to validate.
+ * @param {string} kind - Bundle kind, used only in the error message.
+ * @param {string} cid - Bundle CID, used only in the error message.
+ * @returns {Promise<void>} Resolves when no symlink escapes destDir.
+ * @throws {Error} On the first escaping symlink found.
+ */
+export async function assertNoSymlinkEscapes(destDir, kind, cid) {
+    let destReal;
+    try {
+        destReal = await fs.realpath(destDir);
+    }
+    catch {
+        // destDir is a caller-owned path; if realpath fails here something
+        // is very wrong (race with rm-rf, permission flip). Use the literal
+        // path as the containment root — safer than silently skipping.
+        destReal = destDir;
+    }
+    // Lexically resolved links are built from destDir, which may differ from
+    // destReal when a parent of destDir is itself a symlink, so both spellings
+    // of the root count as "inside" for the lexical check.
+    const destLiteral = path.resolve(destDir);
+    const withSep = (p) => (p.endsWith(path.sep) ? p : p + path.sep);
+    const isInside = (candidate, root) => candidate === root || candidate.startsWith(withSep(root));
+    async function walk(dir) {
+        const entries = await fs.readdir(dir, { withFileTypes: true });
+        for (const entry of entries) {
+            const full = path.join(dir, entry.name);
+            if (entry.isSymbolicLink()) {
+                let target;
+                let contained;
+                try {
+                    target = await fs.realpath(full);
+                    contained = isInside(target, destReal);
+                }
+                catch {
+                    // Dangling (or otherwise unresolvable) link: fall back to
+                    // the link text itself.
+                    let linkText;
+                    try {
+                        linkText = await fs.readlink(full);
+                    }
+                    catch {
+                        // Entry vanished or is unreadable — nothing to follow.
+                        continue;
+                    }
+                    target = path.resolve(path.dirname(full), linkText);
+                    contained = isInside(target, destReal) || isInside(target, destLiteral);
+                }
+                if (!contained) {
+                    throw new Error(`${kind} bundle at CID ${cid} contains symlink ${path.relative(destDir, full)} ` +
+                        `→ ${target}, which escapes the extraction directory (${destReal}). ` +
+                        `Refusing to proceed — malformed or malicious archive.`);
+                }
+            }
+            else if (entry.isDirectory()) {
+                await walk(full);
+            }
+        }
+    }
+    await walk(destDir);
+}
+// Post-extraction check: require the expected entry-point Python file exists
+// and is non-empty. Handles two common tarball layouts:
+//   1. Flat:   tarball root contains eval.py / inference.py directly.
+//   2. Nested: tarball root contains a single subdirectory (e.g. the
+//      commit-hash or paper-slug) that holds the entry point one level down.
+// Exported for unit tests so we can pin behavior on empty files, missing
+// files, and nested-dir layouts without scaffolding a full extraction.
+/**
+ * Require that the bundle's entry-point file exists and is non-empty,
+ * flattening a single-subdirectory layout into destDir when needed.
+ *
+ * @param {string} destDir - Directory the bundle was extracted into.
+ * @param {string} entrypoint - File name to look for (e.g. `eval.py`).
+ * @param {string} kind - Bundle kind; used in the error message.
+ * @param {string} cid - Bundle CID; used in the error message.
+ * @returns {Promise<void>} Resolves once `<destDir>/<entrypoint>` is in place.
+ * @throws {Error} When the entry point is missing or empty in both the flat
+ *         and the nested layout.
+ */
+export async function assertEntrypointPresent(destDir, entrypoint, kind, cid) {
+    // True only for an existing regular file with at least one byte.
+    async function nonEmpty(p) {
+        try {
+            const st = await fs.stat(p);
+            return st.isFile() && st.size > 0;
+        }
+        catch {
+            return false;
+        }
+    }
+    // Case 1: flat layout.
+    if (await nonEmpty(path.join(destDir, entrypoint)))
+        return;
+    // Case 2: nested single-dir layout — walk one level in.
+    const entries = await fs.readdir(destDir, { withFileTypes: true });
+    const dirs = entries.filter((e) => e.isDirectory());
+    if (dirs.length === 1) {
+        const nestedDir = path.join(destDir, dirs[0].name);
+        if (await nonEmpty(path.join(nestedDir, entrypoint))) {
+            // Flatten so run.py's hardcoded /eval/eval.py and /artifact/inference.py
+            // paths resolve.
+            const children = await fs.readdir(nestedDir);
+            // Move the nested directory to a staging name first. Without this,
+            // a child named like its parent (the common `pkg/pkg/` layout)
+            // would be renamed onto the directory that is still being emptied.
+            const taken = new Set([...entries.map((e) => e.name), ...children]);
+            let stagingName = `${dirs[0].name}.flattening`;
+            while (taken.has(stagingName)) {
+                stagingName += "_";
+            }
+            const stagingDir = path.join(destDir, stagingName);
+            await fs.rename(nestedDir, stagingDir);
+            for (const child of children) {
+                await fs.rename(path.join(stagingDir, child), path.join(destDir, child));
+            }
+            await fs.rmdir(stagingDir);
+            return;
+        }
+    }
+    throw new Error(`${kind} bundle at CID ${cid} is missing ${entrypoint} (or it is empty). ` +
+        `The reference sandbox mounts this bundle at /${kind} and runs ${entrypoint}; ` +
+        (kind === "eval"
+            ? "re-pin the eval bundle if you operate the challenge, or try a different --ipfs-gateway."
+            : "the solver's artifact is malformed — this submission cannot be verified as-is."));
+}
 async function pinStdoutToIpfs(gatewayUrl, apiKey, stdout) {
     // The gateway exposes a helper for verifiers to pin their sandbox