wotann 0.5.96 → 0.5.97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. package/dist/index.js +42 -0
  2. package/dist/orchestration/proof-bundles.d.ts +8 -0
  3. package/dist/orchestration/proof-bundles.js +2 -0
  4. package/dist/security/approval-binding.d.ts +52 -0
  5. package/dist/security/approval-binding.js +57 -0
  6. package/dist/security/human-approval.d.ts +2 -0
  7. package/dist/security/human-approval.js +15 -24
  8. package/dist/verification/reproduction/autonomous-gate.d.ts +52 -0
  9. package/dist/verification/reproduction/autonomous-gate.js +71 -0
  10. package/dist/verification/reproduction/checkout-prep.d.ts +48 -0
  11. package/dist/verification/reproduction/checkout-prep.js +78 -0
  12. package/dist/verification/reproduction/diff-checker.d.ts +26 -0
  13. package/dist/verification/reproduction/diff-checker.js +33 -0
  14. package/dist/verification/reproduction/enforcement.d.ts +14 -0
  15. package/dist/verification/reproduction/enforcement.js +30 -0
  16. package/dist/verification/reproduction/exec-runner.d.ts +15 -0
  17. package/dist/verification/reproduction/exec-runner.js +47 -0
  18. package/dist/verification/reproduction/index.d.ts +10 -0
  19. package/dist/verification/reproduction/index.js +10 -0
  20. package/dist/verification/reproduction/mutation-gate.d.ts +42 -0
  21. package/dist/verification/reproduction/mutation-gate.js +43 -0
  22. package/dist/verification/reproduction/proof-artifact.d.ts +16 -0
  23. package/dist/verification/reproduction/proof-artifact.js +22 -0
  24. package/dist/verification/reproduction/replay-runner.d.ts +37 -0
  25. package/dist/verification/reproduction/replay-runner.js +28 -0
  26. package/dist/verification/reproduction/reproduce.d.ts +34 -0
  27. package/dist/verification/reproduction/reproduce.js +31 -0
  28. package/dist/verification/reproduction/verdict.d.ts +39 -0
  29. package/dist/verification/reproduction/verdict.js +40 -0
  30. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -5379,6 +5379,48 @@ program
5379
5379
  ? chalk.green(" Done — Task completed successfully")
5380
5380
  : chalk.red(" Failed — Task did not complete"));
5381
5381
  console.log();
5382
+ // Independent verify-by-reproduction (opt-in: WOTANN_REPRODUCE=1).
5383
+ // Default-off because re-running the suite in an isolated checkout
5384
+ // roughly doubles run time. Replays the agent's claimed checks in a
5385
+ // clean base+diff worktree (a separate trust boundary, deps symlinked)
5386
+ // and reports a verdict the agent cannot fake. Best-effort — never
5387
+ // breaks the run.
5388
+ if (process.env.WOTANN_REPRODUCE === "1") {
5389
+ try {
5390
+ const [repro, os, path] = await Promise.all([
5391
+ import("./verification/reproduction/index.js"),
5392
+ import("node:os"),
5393
+ import("node:path"),
5394
+ ]);
5395
+ const git = repro.buildExecGitRunner();
5396
+ const lastCycle = result.cycles[result.cycles.length - 1];
5397
+ const verdict = await repro.runWorkspaceReproduction({
5398
+ cwd: process.cwd(),
5399
+ claimed: {
5400
+ testsPass: lastCycle?.testsPass ?? false,
5401
+ typecheckPass: lastCycle?.typecheckPass ?? false,
5402
+ lintPass: lastCycle?.lintPass ?? false,
5403
+ },
5404
+ commands: { test: ["npm", "test"], typecheck: ["npm", "run", "typecheck"] },
5405
+ worktreeDir: path.join(os.tmpdir(), `wotann-verify-${Date.now()}`),
5406
+ linkFromRepo: ["node_modules"],
5407
+ }, { git, replay: repro.buildExecReplayRunner() });
5408
+ const tag = verdict.enforcement.action === "block"
5409
+ ? chalk.red(`⛔ ${verdict.result.verdict}`)
5410
+ : verdict.enforcement.action === "allow"
5411
+ ? chalk.green(`✓ ${verdict.result.verdict}`)
5412
+ : chalk.yellow(`⚠ ${verdict.result.verdict}`);
5413
+ console.log(chalk.bold("Independent reproduction (separate trust boundary):"));
5414
+ console.log(` ${tag} — ${verdict.enforcement.reason}`);
5415
+ for (const c of verdict.result.contradictions) {
5416
+ console.log(chalk.dim(` • ${c}`));
5417
+ }
5418
+ console.log();
5419
+ }
5420
+ catch (e) {
5421
+ console.log(chalk.dim(` Reproduction skipped: ${e instanceof Error ? e.message : String(e)}`));
5422
+ }
5423
+ }
5382
5424
  process.exit(result.success ? 0 : 1);
5383
5425
  }
5384
5426
  finally {
@@ -47,6 +47,14 @@ export interface AutonomousProofBundle {
47
47
  readonly visualVerificationEnabled: boolean;
48
48
  readonly visualExpectation?: string;
49
49
  readonly finalChecks: {
50
+ /**
51
+ * Provenance of these checks. "self-reported" = copied from the agent's
52
+ * own cycle result — a CLAIM, not an independently reproduced result. A
53
+ * green check is a claim, not proof (verify-by-reproduction, V7); a future
54
+ * verdict re-runs these in a separate trust boundary and reports a
55
+ * "reproduced" source instead.
56
+ */
57
+ readonly source: "self-reported";
50
58
  readonly testsPass: boolean;
51
59
  readonly typecheckPass: boolean;
52
60
  readonly lintPass: boolean;
@@ -44,6 +44,8 @@ export function writeAutonomousProofBundle(input) {
44
44
  visualExpectation: input.visualExpectation,
45
45
  finalChecks: lastCycle
46
46
  ? {
47
+ // Self-reported by the agent's own cycle — a claim, not proof (V7).
48
+ source: "self-reported",
47
49
  testsPass: lastCycle.testsPass,
48
50
  typecheckPass: lastCycle.typecheckPass,
49
51
  lintPass: lastCycle.lintPass,
@@ -0,0 +1,52 @@
1
+ /**
2
+ * Exact-bytes approval binding (corpus F4; defeats the OpenClaw CVE-2026-29607
3
+ * "approval persisted at the wrapper level, not the inner command" class and
4
+ * the TOCTOU race). Bind the canonicalized action's HMAC at approval time;
5
+ * re-pin at execution and abort on drift, replay (single-use nonce), or expiry.
6
+ */
7
+ export interface CanonicalAction {
8
+ readonly tool: string;
9
+ readonly args: readonly string[];
10
+ readonly cwd: string;
11
+ }
12
+ export interface ApprovalBinding {
13
+ readonly bindingId: string;
14
+ readonly actionHash: string;
15
+ readonly nonce: string;
16
+ readonly expiresAt: number;
17
+ }
18
+ export type VerifyResult = {
19
+ readonly ok: true;
20
+ } | {
21
+ readonly ok: false;
22
+ readonly reason: string;
23
+ };
24
+ /**
25
+ * Deterministic canonical form of an action. The CALLER must pre-resolve shell
26
+ * expansion / env-substitution / path resolution BEFORE binding, so the bound
27
+ * bytes ARE exactly what will execute (the literate wrapper is not what runs).
28
+ */
29
+ export declare function canonicalizeAction(action: CanonicalAction): string;
30
+ export interface ApprovalBinderOptions {
31
+ readonly ttlMs?: number;
32
+ readonly now?: () => number;
33
+ }
34
+ /**
35
+ * Stateful service: the consumed-nonce set lives inside the instance (QB#7
36
+ * per-call state, not module-global). Inject `now` for deterministic tests.
37
+ */
38
+ export declare class ApprovalBinder {
39
+ private readonly secret;
40
+ private readonly ttlMs;
41
+ private readonly now;
42
+ private readonly consumed;
43
+ private counter;
44
+ constructor(secret: Buffer | string, opts?: ApprovalBinderOptions);
45
+ bind(action: CanonicalAction, nonce?: string): ApprovalBinding;
46
+ /**
47
+ * Re-pin at execution: recompute the HMAC from the action that is ABOUT to
48
+ * run and reject on drift, expiry, or replay. Consumes the nonce on success.
49
+ * Order matters: replay > expiry > drift (cheapest, most-specific first).
50
+ */
51
+ verify(binding: ApprovalBinding, action: CanonicalAction): VerifyResult;
52
+ }
@@ -0,0 +1,57 @@
1
+ import { timingSafeEqual } from "node:crypto";
2
+ import { computeBoundaryHmac } from "./prompt-injection-quarantine.js";
3
/**
 * Deterministic canonical form of an action. The CALLER must pre-resolve shell
 * expansion / env-substitution / path resolution BEFORE binding, so the bound
 * bytes ARE exactly what will execute (the literate wrapper is not what runs).
 *
 * @param action - `{ tool, args, cwd }`; `tool` is whitespace-trimmed, `args`
 *   and `cwd` are serialized as given.
 * @returns A JSON string with a fixed key order (tool, args, cwd).
 */
export function canonicalizeAction(action) {
    return JSON.stringify({ tool: action.tool.trim(), args: action.args, cwd: action.cwd });
}
/**
 * Computes the MAC stored in `ApprovalBinding.actionHash`.
 *
 * The MAC covers the canonical action AND the binding's `nonce` and
 * `expiresAt`. If only the action were authenticated, anyone able to alter the
 * binding object could swap in a fresh nonce (defeating single-use) or push
 * `expiresAt` into the future (defeating expiry) without being detected.
 */
function sealBinding(secret, action, nonce, expiresAt) {
    const payload = JSON.stringify({ action: canonicalizeAction(action), nonce, expiresAt });
    return computeBoundaryHmac(payload, secret);
}
/**
 * Stateful service: the consumed-nonce set lives inside the instance (QB#7
 * per-call state, not module-global). Inject `now` for deterministic tests.
 */
export class ApprovalBinder {
    secret;
    ttlMs;
    now;
    consumed = new Set();
    counter = 0;
    /**
     * @param secret - HMAC key handed to `computeBoundaryHmac`.
     * @param opts - `ttlMs` (default 5 minutes) and an injectable `now` clock.
     */
    constructor(secret, opts = {}) {
        this.secret = secret;
        this.ttlMs = opts.ttlMs ?? 5 * 60_000;
        this.now = opts.now ?? (() => Date.now());
    }
    /**
     * Issue a binding for `action` at approval time.
     *
     * @param action - The fully resolved action being approved.
     * @param nonce - Optional caller-chosen nonce; otherwise one is derived from
     *   the clock and the per-instance counter (unique within this instance).
     * @returns The binding; `actionHash` authenticates action, nonce and expiry.
     */
    bind(action, nonce) {
        const id = ++this.counter;
        const issuedNonce = nonce ?? `n-${this.now()}-${id}`;
        const expiresAt = this.now() + this.ttlMs;
        return {
            bindingId: `bind-${id}`,
            actionHash: sealBinding(this.secret, action, issuedNonce, expiresAt),
            nonce: issuedNonce,
            expiresAt,
        };
    }
    /**
     * Re-pin at execution: recompute the HMAC from the action that is ABOUT to
     * run and reject on drift, expiry, or replay. Consumes the nonce on success.
     * Order matters: replay > expiry > drift (cheapest, most-specific first).
     *
     * Because the MAC also covers `nonce` and `expiresAt`, a binding whose nonce
     * or expiry was altered after approval is rejected at the drift step.
     *
     * @returns `{ ok: true }`, or `{ ok: false, reason }` naming the failed check.
     */
    verify(binding, action) {
        if (this.consumed.has(binding.nonce)) {
            return { ok: false, reason: "replay: nonce already consumed" };
        }
        if (this.now() > binding.expiresAt) {
            return { ok: false, reason: "expired: approval window elapsed" };
        }
        const recomputed = sealBinding(this.secret, action, binding.nonce, binding.expiresAt);
        const a = Buffer.from(recomputed, "utf-8");
        const b = Buffer.from(binding.actionHash, "utf-8");
        // timingSafeEqual throws on unequal lengths, so compare lengths first.
        if (a.length !== b.length || !timingSafeEqual(a, b)) {
            return { ok: false, reason: "hash drift: action changed since approval" };
        }
        this.consumed.add(binding.nonce);
        return { ok: true };
    }
}
@@ -19,6 +19,8 @@ export interface ApprovalRequest {
19
19
  readonly tool: string;
20
20
  readonly args: string;
21
21
  readonly reason: string;
22
+ /** Optional exact-bytes approval-binding hash (see security/approval-binding.ts). */
23
+ readonly hash?: string;
22
24
  readonly risk: "low" | "medium" | "high" | "critical";
23
25
  readonly riskLevel: "moderate" | "high" | "critical";
24
26
  readonly createdAt: number;
@@ -26,7 +26,10 @@ const DEFAULT_POLICIES = [
26
26
  riskLevel: "moderate",
27
27
  channels: ["cli", "desktop"],
28
28
  timeoutMs: 60_000, // 1 minute
29
- defaultOnTimeout: "approve",
29
+ // Fail CLOSED: an absent/stalled human must never auto-approve a write or
30
+ // edit. An adversarial agent could otherwise stall past the timeout to win
31
+ // an automatic approval. (bash is already fail-closed; this matches it.)
32
+ defaultOnTimeout: "deny",
30
33
  },
31
34
  ];
32
35
  // ── Approval Manager ─────────────────────────────────────
@@ -71,7 +74,7 @@ export class HumanApprovalManager {
71
74
  feedback: `Approval timed out after ${timeoutMs}ms. Default: ${defaultOnTimeout}.`,
72
75
  respondedAt: Date.now(),
73
76
  };
74
- this.history.push(response);
77
+ this.history.push({ request, response });
75
78
  resolve(response);
76
79
  }
77
80
  }, timeoutMs);
@@ -80,7 +83,7 @@ export class HumanApprovalManager {
80
83
  request,
81
84
  resolve: (response) => {
82
85
  clearTimeout(timer);
83
- this.history.push(response);
86
+ this.history.push({ request, response });
84
87
  resolve(response);
85
88
  },
86
89
  });
@@ -113,7 +116,7 @@ export class HumanApprovalManager {
113
116
  * Get approval history.
114
117
  */
115
118
  getHistory() {
116
- return this.history;
119
+ return this.history.map((record) => record.response);
117
120
  }
118
121
  /**
119
122
  * Add a custom policy.
@@ -146,30 +149,18 @@ export class HumanApprovalManager {
146
149
  * Get full audit log of all approval decisions (request + result pairs).
147
150
  */
148
151
  getAuditLog() {
149
- return this.history.map((response) => {
150
- // Look up the original request from pending or reconstruct from response
151
- const request = {
152
- id: response.requestId,
153
- action: "tool_call",
154
- description: response.feedback ?? "",
155
- tool: "",
156
- args: "",
157
- reason: "",
158
- risk: "medium",
159
- riskLevel: "moderate",
160
- createdAt: response.respondedAt,
161
- timestamp: response.respondedAt,
162
- timeoutMs: 0,
163
- channels: [],
164
- };
165
- const result = {
152
+ // The REAL request is recorded at decision time and read back verbatim —
153
+ // never reconstructed — so the audit trail can never drift from what was
154
+ // actually approved.
155
+ return this.history.map(({ request, response }) => ({
156
+ request,
157
+ result: {
166
158
  approved: response.decision === "approve",
167
159
  approvedBy: response.respondedBy,
168
160
  feedback: response.feedback,
169
161
  decidedAt: response.respondedAt,
170
- };
171
- return { request, result };
172
- });
162
+ },
163
+ }));
173
164
  }
174
165
  // ── Private ────────────────────────────────────────────
175
166
  assessRisk(tool, args) {
@@ -0,0 +1,52 @@
1
+ import { type GitRunner } from "./checkout-prep.js";
2
+ import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
3
+ import { type ProofArtifact } from "./proof-artifact.js";
4
+ import { type EnforcementDecision } from "./enforcement.js";
5
+ import type { ReplayRunner, ReplayCommands } from "./replay-runner.js";
6
+ import type { MutationRunner } from "./mutation-gate.js";
7
+ export interface AutonomousReproduceInput {
8
+ readonly repoDir: string;
9
+ readonly baseRef: string;
10
+ readonly diffText: string;
11
+ readonly changedPaths: readonly string[];
12
+ readonly claimed: ClaimedChecks;
13
+ readonly commands: ReplayCommands;
14
+ /** A fresh, non-existent directory for the verifier-box worktree. */
15
+ readonly worktreeDir: string;
16
+ /** Paths to symlink from repoDir into the worktree (e.g. ["node_modules"]). */
17
+ readonly linkFromRepo?: readonly string[];
18
+ }
19
+ export interface AutonomousReproduceRunners {
20
+ readonly git: GitRunner;
21
+ readonly replay: ReplayRunner;
22
+ readonly mutation?: MutationRunner;
23
+ }
24
+ export interface AutonomousReproduceOutput {
25
+ readonly result: ReproductionResult;
26
+ readonly proof: ProofArtifact;
27
+ readonly enforcement: EnforcementDecision;
28
+ }
29
+ /**
30
+ * Full production composition: prepare an isolated checkout (the separate trust
31
+ * boundary), reproduce the claimed result inside it, decide enforcement, and
32
+ * ALWAYS clean up the worktree. A failed checkout => infra-error (never an
33
+ * auto-pass — we couldn't verify, so we don't silently allow).
34
+ */
35
+ export declare function reproduceAutonomousRun(input: AutonomousReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
36
+ export interface WorkspaceReproduceInput {
37
+ /** The agent's workspace (its uncommitted changes are captured as the diff). */
38
+ readonly cwd: string;
39
+ readonly claimed: ClaimedChecks;
40
+ readonly commands: ReplayCommands;
41
+ /** A fresh, non-existent directory for the verifier-box worktree. */
42
+ readonly worktreeDir: string;
43
+ /** Paths to symlink from the workspace into the worktree (e.g. ["node_modules"]). */
44
+ readonly linkFromRepo?: readonly string[];
45
+ }
46
+ /**
47
+ * Capture the workspace's git state (HEAD as base + the uncommitted diff) and
48
+ * reproduce. Assumes the agent's changes are UNCOMMITTED (the common autonomous
49
+ * case): HEAD is the base and `git diff` is the agent's work. If the agent
50
+ * committed mid-run, the diff is empty and the clean base is reproduced.
51
+ */
52
+ export declare function runWorkspaceReproduction(input: WorkspaceReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
@@ -0,0 +1,71 @@
1
+ import { prepareVerifierCheckout } from "./checkout-prep.js";
2
+ import { reproduceRun } from "./reproduce.js";
3
+ import { decideReproductionVerdict, } from "./verdict.js";
4
+ import { buildProofArtifact } from "./proof-artifact.js";
5
+ import { enforceReproductionVerdict } from "./enforcement.js";
6
/**
 * Full production composition: prepare an isolated checkout (the separate trust
 * boundary), reproduce the claimed result inside it, decide enforcement, and
 * ALWAYS clean up the worktree. A failed checkout => infra-error (never an
 * auto-pass — we couldn't verify, so we don't silently allow).
 *
 * @param input - Repo, base ref, diff, changed paths, claimed checks, replay
 *   commands and the worktree location.
 * @param runners - `git` and `replay` runners, plus an optional `mutation` runner.
 * @returns The reproduction result, its proof artifact and the enforcement decision.
 */
export async function reproduceAutonomousRun(input, runners) {
    const checkout = await prepareVerifierCheckout({
        repoDir: input.repoDir,
        baseRef: input.baseRef,
        diffText: input.diffText,
        worktreeDir: input.worktreeDir,
        ...(input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {}),
    }, runners.git);
    if (!checkout.ok) {
        // Nothing was observed, so every check is null and the failure is
        // reported through `infraError`.
        const result = decideReproductionVerdict({
            claimed: input.claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `checkout:${checkout.error}`,
        });
        const proof = buildProofArtifact(result, input.diffText);
        return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
    }
    try {
        // Only pass `mutation` when one was supplied, so the key is absent
        // rather than present-but-undefined.
        const reproRunners = runners.mutation
            ? { replay: runners.replay, mutation: runners.mutation }
            : { replay: runners.replay };
        const { result, proof } = await reproduceRun({
            claimed: input.claimed,
            changedPaths: input.changedPaths,
            checkoutDir: checkout.checkout.checkoutDir,
            commands: input.commands,
            diffText: input.diffText,
            mutationFiles: input.changedPaths,
        }, reproRunners);
        return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
    }
    finally {
        try {
            await checkout.checkout.cleanup();
        }
        catch {
            // An exception thrown from `finally` would REPLACE the verdict being
            // returned (or the original error being thrown). A leaked worktree is
            // recoverable; a lost "block" verdict is not — so cleanup failures
            // must never escape here.
        }
    }
}
48
/**
 * Capture the workspace's git state (HEAD as base + the uncommitted diff) and
 * reproduce. Assumes the agent's changes are UNCOMMITTED (the common autonomous
 * case): HEAD is the base and the diff against HEAD is the agent's work. If the
 * agent committed mid-run, the diff is empty and the clean base is reproduced.
 *
 * The diff is taken against HEAD (not the index) so that staged changes —
 * including newly added files — are part of the replayed patch, and with
 * `--binary` so binary changes survive `git apply`. Files that are still
 * untracked do not appear in `git diff` output and are therefore not replayed.
 *
 * If any of the capture commands fails, the run is reported as an infra-error
 * instead of continuing with partial or empty git output.
 *
 * @param input - Workspace dir, claimed checks, replay commands, worktree dir.
 * @param runners - `git` and `replay` runners, plus an optional `mutation` runner.
 * @returns The reproduction result, its proof artifact and the enforcement decision.
 */
export async function runWorkspaceReproduction(input, runners) {
    const head = await runners.git.run(input.cwd, ["rev-parse", "HEAD"]);
    const diff = await runners.git.run(input.cwd, ["diff", "--binary", "HEAD"]);
    const names = await runners.git.run(input.cwd, ["diff", "--name-only", "HEAD"]);
    const captures = { "rev-parse": head, diff, "diff-name-only": names };
    for (const [step, out] of Object.entries(captures)) {
        if (out.exitCode !== 0) {
            // We could not even capture what to verify: never fall through to a
            // reproduction of the wrong (e.g. empty) diff.
            const result = decideReproductionVerdict({
                claimed: input.claimed,
                observed: { testsPass: null, typecheckPass: null, lintPass: null },
                tampered: false,
                infraError: `workspace-capture:${step}:exit=${out.exitCode}:${out.stderr.trim().slice(0, 240)}`,
            });
            const proof = buildProofArtifact(result, diff.stdout);
            return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
        }
    }
    const changedPaths = names.stdout
        .split("\n")
        .map((s) => s.trim())
        .filter(Boolean);
    return reproduceAutonomousRun({
        repoDir: input.cwd,
        baseRef: head.stdout.trim(),
        diffText: diff.stdout,
        changedPaths,
        claimed: input.claimed,
        commands: input.commands,
        worktreeDir: input.worktreeDir,
        ...(input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {}),
    }, runners);
}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Prepare the separate-trust-boundary checkout for verify-by-reproduction
3
+ * (spec §3.1, the missing prerequisite for a correct production trigger).
4
+ *
5
+ * The verifier box MUST run a CLEAN base checkout + ONLY the agent's diff — in
6
+ * a separate directory — so the agent's dirty workspace (runtime tampering,
7
+ * stray files, monkeypatched runners) can never influence the result. Running
8
+ * the replay in the agent's own `process.cwd()` would be the BenchJack V1
9
+ * violation this defeats.
10
+ */
11
+ export interface GitRunner {
12
+ readonly run: (cwd: string, argv: readonly string[]) => Promise<{
13
+ exitCode: number;
14
+ stdout: string;
15
+ stderr: string;
16
+ }>;
17
+ }
18
+ /** Production GitRunner via execFile (argv-only, injection-safe). */
19
+ export declare function buildExecGitRunner(): GitRunner;
20
+ export interface PrepareCheckoutInput {
21
+ /** The agent's repository (the worktree is registered here). */
22
+ readonly repoDir: string;
23
+ /** The commit the agent started from (before its changes). */
24
+ readonly baseRef: string;
25
+ /** The agent's diff (unified, as from `git diff`). Empty = clean base. */
26
+ readonly diffText: string;
27
+ /** A fresh, non-existent directory for the detached worktree. */
28
+ readonly worktreeDir: string;
29
+ /**
30
+ * Paths (relative to repoDir) to symlink into the worktree after checkout —
31
+ * e.g. ["node_modules"]. A fresh worktree lacks gitignored deps, so without
32
+ * this the replay's `npm test` would fail on missing modules (a FALSE
33
+ * 'contradicted'). Deps are not the grading surface, so sharing them is safe.
34
+ */
35
+ readonly linkFromRepo?: readonly string[];
36
+ }
37
+ export interface VerifierCheckout {
38
+ readonly checkoutDir: string;
39
+ readonly cleanup: () => Promise<void>;
40
+ }
41
+ export type CheckoutResult = {
42
+ readonly ok: true;
43
+ readonly checkout: VerifierCheckout;
44
+ } | {
45
+ readonly ok: false;
46
+ readonly error: string;
47
+ };
48
+ export declare function prepareVerifierCheckout(input: PrepareCheckoutInput, git: GitRunner): Promise<CheckoutResult>;
@@ -0,0 +1,78 @@
1
+ import { writeFileSync, symlinkSync, existsSync } from "node:fs";
2
+ import { join } from "node:path";
3
/**
 * Production GitRunner via execFile (argv-only, injection-safe).
 *
 * The returned `run` never rejects: every outcome is reported as
 * `{ exitCode, stdout, stderr }`. A numeric `error.code` is used as the exit
 * code; any other failure (spawn error, exceeded buffer, …) maps to 1.
 * When the process produced no stderr text, the error's own message is
 * reported instead so the cause of the failure is not lost.
 */
export function buildExecGitRunner() {
    return {
        run: async (cwd, argv) => {
            const { execFile } = await import("node:child_process");
            return new Promise((resolve) => {
                execFile("git", [...argv], { cwd, maxBuffer: 64 * 1024 * 1024 }, (error, stdout, stderr) => {
                    const exitCode = error && typeof error.code === "number"
                        ? Number(error.code)
                        : error
                            ? 1
                            : 0;
                    // execFile hands back "" (not null/undefined) for stderr when the
                    // process never ran, so a `??` fallback would never fire.
                    const stderrText = stderr?.toString() ?? "";
                    resolve({
                        exitCode,
                        stdout: stdout?.toString() ?? "",
                        stderr: stderrText !== ""
                            ? stderrText
                            : error instanceof Error
                                ? error.message
                                : "",
                    });
                });
            });
        },
    };
}
25
/**
 * Create the verifier checkout: a detached worktree at `input.baseRef`, with
 * `input.diffText` applied on top and the `linkFromRepo` paths symlinked in.
 *
 * Steps and failure handling:
 *  1. `git worktree add --detach` — a non-zero exit returns `{ ok: false }`.
 *  2. If the diff is non-empty, it is written to a temporary patch file inside
 *     the worktree and applied with `git apply`. The patch file is removed
 *     again afterwards so the checkout contains only base + diff. A write or
 *     apply failure removes the worktree and returns `{ ok: false }`.
 *  3. Each `linkFromRepo` entry is symlinked best-effort (skipped when the
 *     source is missing or the destination already exists).
 *
 * @param input - See `PrepareCheckoutInput`.
 * @param git - Runner used for all git invocations.
 * @returns `{ ok: true, checkout }` with a `cleanup()` that force-removes the
 *   worktree, or `{ ok: false, error }`.
 */
export async function prepareVerifierCheckout(input, git) {
    const { rmSync } = await import("node:fs");
    const { resolve } = await import("node:path");
    const add = await git.run(input.repoDir, [
        "worktree",
        "add",
        "--detach",
        input.worktreeDir,
        input.baseRef,
    ]);
    if (add.exitCode !== 0) {
        return {
            ok: false,
            error: `worktree-add-failed:exit=${add.exitCode}:${add.stderr.trim().slice(0, 240)}`,
        };
    }
    const cleanup = async () => {
        await git.run(input.repoDir, ["worktree", "remove", "--force", input.worktreeDir]);
    };
    if (input.diffText.trim() !== "") {
        const patchPath = join(input.worktreeDir, ".wotann-verify.patch");
        try {
            writeFileSync(patchPath, input.diffText, "utf-8");
        }
        catch (e) {
            await cleanup();
            return {
                ok: false,
                error: `patch-write-failed:${e instanceof Error ? e.message : String(e)}`,
            };
        }
        const apply = await git.run(input.worktreeDir, ["apply", "--whitespace=nowarn", patchPath]);
        try {
            // The patch is scaffolding, not part of base+diff: remove it so the
            // replayed commands never see a stray untracked file.
            rmSync(patchPath, { force: true });
        }
        catch {
            // Best-effort: a leftover patch file does not invalidate the checkout.
        }
        if (apply.exitCode !== 0) {
            await cleanup();
            return {
                ok: false,
                error: `apply-failed:exit=${apply.exitCode}:${apply.stderr.trim().slice(0, 240)}`,
            };
        }
    }
    for (const rel of input.linkFromRepo ?? []) {
        // A relative symlink target is resolved against the LINK's directory, so
        // the source must be made absolute or a relative `repoDir` would yield a
        // dangling link.
        const src = resolve(input.repoDir, rel);
        const dst = join(input.worktreeDir, rel);
        // Best-effort: skip when the source is absent or the target already exists.
        if (existsSync(src) && !existsSync(dst)) {
            try {
                symlinkSync(src, dst, "dir");
            }
            catch {
                // A failed symlink just means the replay may hit missing deps and
                // report a result the verdict logic handles — never a silent pass.
            }
        }
    }
    return { ok: true, checkout: { checkoutDir: input.worktreeDir, cleanup } };
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
3
+ * must touch ONLY source — never the test harness / its own grader. A diff
4
+ * that edits the grading surface is TAMPER, not a normal failure.
5
+ *
6
+ * NOTE: package.json is deliberately NOT protected here (legit dep edits are
7
+ * common); detecting edits to its `scripts.test` field specifically is a
8
+ * Phase-B field-level refinement.
9
+ */
10
+ export declare const DEFAULT_PROTECTED_PATTERNS: readonly RegExp[];
11
+ export interface TestAuthorship {
12
+ /** Matches files considered "tests" (whose edits are gated). */
13
+ readonly testFilePattern: RegExp;
14
+ /** Test files the agent legitimately created/edited THIS task (allowlisted). */
15
+ readonly authoredTestFiles: readonly string[];
16
+ }
17
+ export interface DiffCheckResult {
18
+ readonly tampered: boolean;
19
+ readonly offendingPaths: readonly string[];
20
+ }
21
+ /**
22
+ * Pure. `changedPaths` are repo-relative POSIX paths the agent's diff touched.
23
+ * A path is offending if it matches a protected pattern, OR it is a test file
24
+ * (per `authorship.testFilePattern`) that is not in `authoredTestFiles`.
25
+ */
26
+ export declare function checkDiff(changedPaths: readonly string[], patterns?: readonly RegExp[], authorship?: TestAuthorship): DiffCheckResult;
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
3
+ * must touch ONLY source — never the test harness / its own grader. A diff
4
+ * that edits the grading surface is TAMPER, not a normal failure.
5
+ *
6
+ * NOTE: package.json is deliberately NOT protected here (legit dep edits are
7
+ * common); detecting edits to its `scripts.test` field specifically is a
8
+ * Phase-B field-level refinement.
9
+ */
10
/** Paths that make up the grading surface by default (test-runner configs and `.git`). */
export const DEFAULT_PROTECTED_PATTERNS = Object.freeze([
    /(^|\/)conftest\.py$/,
    /(^|\/)pytest\.ini$/,
    /(^|\/)tox\.ini$/,
    /(^|\/)(jest|vitest|playwright)\.config\.[cm]?[jt]s$/,
    /(^|\/)\.mocharc\.[a-z]+$/,
    /(^|\/)\.git(\/|$)/,
]);
/**
 * `RegExp.prototype.test` on a global (`g`) or sticky (`y`) regex resumes from
 * `lastIndex` left by the previous call, so reusing one regex across several
 * paths would skip matches. Reset it first so every path is tested from 0.
 * Non-global regexes are left untouched.
 */
function testFromStart(re, text) {
    if (re.global || re.sticky) {
        re.lastIndex = 0;
    }
    return re.test(text);
}
/**
 * Pure. `changedPaths` are repo-relative POSIX paths the agent's diff touched.
 * A path is offending if it matches a protected pattern, OR it is a test file
 * (per `authorship.testFilePattern`) that is not in `authoredTestFiles`.
 *
 * @param changedPaths - Paths touched by the diff.
 * @param patterns - Protected-path patterns; defaults to `DEFAULT_PROTECTED_PATTERNS`.
 * @param authorship - Optional test-file gate; when omitted, test files are not gated.
 * @returns `{ tampered, offendingPaths }`, with `offendingPaths` in input order.
 */
export function checkDiff(changedPaths, patterns = DEFAULT_PROTECTED_PATTERNS, authorship) {
    const authored = new Set(authorship?.authoredTestFiles ?? []);
    const offendingPaths = changedPaths.filter((p) => {
        if (patterns.some((re) => testFromStart(re, p)))
            return true;
        if (authorship && testFromStart(authorship.testFilePattern, p) && !authored.has(p))
            return true;
        return false;
    });
    return { tampered: offendingPaths.length > 0, offendingPaths };
}
@@ -0,0 +1,14 @@
1
+ import type { ReproductionVerdict } from "./verdict.js";
2
+ export type EnforcementAction = "allow" | "block" | "surface" | "escalate";
3
+ export interface EnforcementDecision {
4
+ readonly action: EnforcementAction;
5
+ readonly reason: string;
6
+ }
7
+ /**
8
+ * Asymmetric enforcement (spec §3.2). A false PASS is the moat-killer; a false
9
+ * BLOCK is recoverable — so reproduction-sourced tamper/contradicted HARD-BLOCK,
10
+ * while weaker signals surface or escalate. Keyed on the REPRODUCTION verdict
11
+ * (trustworthy, executable) — never on bare LLM-judge text (the TNR<25% yes-man
12
+ * problem is exactly why the enforce-flip waited for the reproduction channel).
13
+ */
14
+ export declare function enforceReproductionVerdict(verdict: ReproductionVerdict): EnforcementDecision;
@@ -0,0 +1,30 @@
1
/**
 * Asymmetric enforcement (spec §3.2). A false PASS is the moat-killer; a false
 * BLOCK is recoverable — so reproduction-sourced tamper/contradicted HARD-BLOCK,
 * while weaker signals surface or escalate. Keyed on the REPRODUCTION verdict
 * (trustworthy, executable) — never on bare LLM-judge text (the TNR<25% yes-man
 * problem is exactly why the enforce-flip waited for the reproduction channel).
 *
 * @param verdict - One of "tamper" | "contradicted" | "weak-tests" |
 *   "infra-error" | "reproduced".
 * @returns The action to take and a human-readable reason. A value outside the
 *   known verdicts is escalated (never allowed), so callers always receive a
 *   decision object instead of `undefined`.
 */
export function enforceReproductionVerdict(verdict) {
    switch (verdict) {
        case "tamper":
            return { action: "block", reason: "diff tampered with the grading surface" };
        case "contradicted":
            return {
                action: "block",
                reason: "claimed success contradicted by independent reproduction",
            };
        case "weak-tests":
            return {
                action: "surface",
                reason: "reproduction passed but tests are too weak to trust (low mutation score)",
            };
        case "infra-error":
            return {
                action: "escalate",
                reason: "could not reproduce — verify manually before trusting",
            };
        case "reproduced":
            return { action: "allow", reason: "independently reproduced in a separate trust boundary" };
        default:
            // Fail closed: an unknown verdict was not verified, so it must not
            // fall out of the switch as `undefined` (callers read `.action`).
            return {
                action: "escalate",
                reason: `unrecognized reproduction verdict "${String(verdict)}" — verify manually before trusting`,
            };
    }
}
@@ -0,0 +1,15 @@
1
+ import type { ReplayRunner } from "./replay-runner.js";
2
+ /**
3
+ * Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
4
+ * shell interpolation, so injection-safe regardless of the command content,
5
+ * matching the codebase's execFileNoThrow contract). Runs the claimed commands
6
+ * inside `dir`, the verifier-box checkout.
7
+ *
8
+ * Honest probe: confirms the Node interpreter is runnable; reports the failure
9
+ * explicitly otherwise (so runReplay emits `infra-error`, never a silent pass).
10
+ *
11
+ * Trust-boundary note: this runs on the HOST in a given directory. Full
12
+ * separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
13
+ * production hardening — this is the host-dir baseline so the loop runs for real.
14
+ */
15
+ export declare function buildExecReplayRunner(): ReplayRunner;
@@ -0,0 +1,47 @@
1
/**
 * Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
 * shell interpolation, so injection-safe regardless of the command content,
 * matching the codebase's execFileNoThrow contract). Runs the claimed commands
 * inside `dir`, the verifier-box checkout.
 *
 * Honest probe: confirms the Node interpreter is runnable; reports the failure
 * explicitly otherwise (so runReplay emits `infra-error`, never a silent pass).
 *
 * Trust-boundary note: this runs on the HOST in a given directory. Full
 * separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
 * production hardening — this is the host-dir baseline so the loop runs for real.
 *
 * `runInDir` never rejects. A numeric `error.code` becomes the exit code, any
 * other failure maps to 1, and when the command produced no stderr text the
 * error's own message is reported so a spawn failure is distinguishable from a
 * command that simply exited non-zero.
 */
export function buildExecReplayRunner() {
    return {
        probe: async () => {
            const { execFile } = await import("node:child_process");
            return new Promise((resolve) => {
                execFile(process.execPath, ["--version"], (error) => {
                    resolve(error
                        ? { ok: false, reason: error instanceof Error ? error.message : String(error) }
                        : { ok: true });
                });
            });
        },
        runInDir: async (dir, argv) => {
            const { execFile } = await import("node:child_process");
            const [file, ...rest] = argv;
            if (!file)
                return { exitCode: 1, stdout: "", stderr: "empty argv" };
            return new Promise((resolve) => {
                execFile(file, rest, { cwd: dir, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
                    const exitCode = error && typeof error.code === "number"
                        ? Number(error.code)
                        : error
                            ? 1
                            : 0;
                    // stderr is "" (not null/undefined) when the process never ran,
                    // so a `??` fallback would never surface the spawn error.
                    const stderrText = stderr?.toString() ?? "";
                    resolve({
                        exitCode,
                        stdout: stdout?.toString() ?? "",
                        stderr: stderrText !== ""
                            ? stderrText
                            : error instanceof Error
                                ? error.message
                                : "",
                    });
                });
            });
        },
    };
}
@@ -0,0 +1,10 @@
1
+ export { decideReproductionVerdict, type ReproductionVerdict, type ClaimedChecks, type ObservedChecks, type ReproductionInput, type ReproductionResult, } from "./verdict.js";
2
+ export { checkDiff, DEFAULT_PROTECTED_PATTERNS, type DiffCheckResult, type TestAuthorship, } from "./diff-checker.js";
3
+ export { runReplay, type ReplayRunner, type ReplayCommands, type ReplayInput, } from "./replay-runner.js";
4
+ export { buildProofArtifact, type ProofArtifact } from "./proof-artifact.js";
5
+ export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, type MutationResult, type MutationGateResult, type MutationRunner, } from "./mutation-gate.js";
6
+ export { enforceReproductionVerdict, type EnforcementAction, type EnforcementDecision, } from "./enforcement.js";
7
+ export { reproduceRun, type ReproduceInput, type ReproduceRunners, type ReproduceOutput, } from "./reproduce.js";
8
+ export { buildExecReplayRunner } from "./exec-runner.js";
9
+ export { prepareVerifierCheckout, buildExecGitRunner, type GitRunner, type PrepareCheckoutInput, type VerifierCheckout, type CheckoutResult, } from "./checkout-prep.js";
10
+ export { reproduceAutonomousRun, runWorkspaceReproduction, type AutonomousReproduceInput, type AutonomousReproduceRunners, type AutonomousReproduceOutput, type WorkspaceReproduceInput, } from "./autonomous-gate.js";
@@ -0,0 +1,10 @@
1
+ export { decideReproductionVerdict, } from "./verdict.js";
2
+ export { checkDiff, DEFAULT_PROTECTED_PATTERNS, } from "./diff-checker.js";
3
+ export { runReplay, } from "./replay-runner.js";
4
+ export { buildProofArtifact } from "./proof-artifact.js";
5
+ export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, } from "./mutation-gate.js";
6
+ export { enforceReproductionVerdict, } from "./enforcement.js";
7
+ export { reproduceRun, } from "./reproduce.js";
8
+ export { buildExecReplayRunner } from "./exec-runner.js";
9
+ export { prepareVerifierCheckout, buildExecGitRunner, } from "./checkout-prep.js";
10
+ export { reproduceAutonomousRun, runWorkspaceReproduction, } from "./autonomous-gate.js";
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Mutation gate (BenchJack V6 defense): a green test run is worthless if the
3
+ * tests have no real assertions ("100% coverage, 0% mutation score"). Mutate
4
+ * the changed code; if the agent's OWN tests don't kill enough mutants,
5
+ * downgrade a "reproduced" verdict to "weak-tests".
6
+ *
7
+ * The gate DECISION is pure. Running the mutation tool (Stryker/mutmut) inside
8
+ * the verifier box is the injected `MutationRunner` (production wiring deferred,
9
+ * same DI pattern as replay-runner / vm-isolation).
10
+ */
11
+ export interface MutationResult { // raw mutant counts resolved by MutationRunner.run
12
+ readonly killed: number; // numerator of the score: gateMutation computes killed / total
13
+ readonly total: number; // denominator of the score; total <= 0 is gated as weak with score 0
14
+ }
15
+ export interface MutationGateResult { // outcome of gateMutation / runMutationGate
16
+ readonly weakTests: boolean; // true when total <= 0 or score < threshold; false when the runner probe failed
17
+ readonly score: number; // killed / total; 0 when no mutants were generated or the run was skipped
18
+ readonly threshold: number; // the threshold the score was compared against, echoed back
19
+ readonly reason?: string; // present only when weakTests is true
20
+ }
21
+ export declare const DEFAULT_MUTATION_THRESHOLD = 0.6; // default `threshold` argument of gateMutation and runMutationGate
22
+ /**
23
+ * Pure. `total <= 0` means no mutants were generated — the tests do not
24
+ * exercise the changed code at all — which is the worst case, so it is `weak`.
25
+ */
26
+ export declare function gateMutation(result: MutationResult, threshold?: number): MutationGateResult; // otherwise weak when killed / total < threshold
27
+ /** Injected runner (DI like replay-runner). Production wires Stryker/mutmut; tests stub. */
28
+ export interface MutationRunner {
29
+ readonly probe: () => Promise<{ // awaited first by runMutationGate, before any run
30
+ ok: boolean; // false => run is never called and the gate result carries `unavailable`
31
+ reason?: string; // appended as `mutation:unavailable:<reason>` when set
32
+ }>;
33
+ readonly run: (dir: string, changedFiles: readonly string[]) => Promise<MutationResult>; // resolved counts are fed to gateMutation
34
+ }
35
+ /**
36
+ * Honest stub: a failed probe means the mutation tool is unavailable. Mutation
37
+ * testing is a BONUS downgrade (the reproduction channel carries enforcement),
38
+ * so "unavailable" does NOT set `weakTests` — it just records why it was skipped.
39
+ */
40
+ export declare function runMutationGate(dir: string, changedFiles: readonly string[], runner: MutationRunner, threshold?: number): Promise<MutationGateResult & {
41
+ readonly unavailable?: string; // set only when the probe fails: `mutation:unavailable[:<reason>]`
42
+ }>;
@@ -0,0 +1,43 @@
1
export const DEFAULT_MUTATION_THRESHOLD = 0.6;
/**
 * Pure mutation-gate decision: are the tests strong enough to trust a green run?
 *
 * The score is `killed / total`. The result is `weakTests: true` when
 *  - `total <= 0`: no mutants were generated, i.e. the tests do not exercise
 *    the changed code at all (the worst case);
 *  - the score or the threshold is NaN (e.g. a runner reported NaN or missing
 *    counts): a NaN comparison is always false, so without this guard a
 *    malformed result would slip through as "not weak". The gate fails closed;
 *  - the score is below `threshold`.
 *
 * @param result    Mutant counts reported by the mutation runner.
 * @param threshold Minimum acceptable score; defaults to DEFAULT_MUTATION_THRESHOLD.
 * @returns The decision, the score (0 when it could not be computed), the
 *          threshold that was applied, and a `reason` whenever the tests are weak.
 */
export function gateMutation(result, threshold = DEFAULT_MUTATION_THRESHOLD) {
    const { killed, total } = result;
    if (total <= 0) {
        return {
            weakTests: true,
            score: 0,
            threshold,
            reason: "no mutants generated — tests do not exercise the changed code",
        };
    }
    const score = killed / total;
    // Fail closed: NaN never compares as "< threshold", so it must be rejected explicitly.
    if (Number.isNaN(score) || Number.isNaN(threshold)) {
        return {
            weakTests: true,
            score: Number.isNaN(score) ? 0 : score,
            threshold,
            reason: `invalid mutation input (killed=${killed}, total=${total}, threshold=${threshold})`,
        };
    }
    if (score < threshold) {
        return {
            weakTests: true,
            score,
            threshold,
            reason: `mutation score ${score.toFixed(2)} < threshold ${threshold}`,
        };
    }
    return { weakTests: false, score, threshold };
}
26
/**
 * Probe the injected mutation runner and, if it is usable, gate its result.
 *
 * Mutation testing is only a bonus downgrade (enforcement rides on the
 * reproduction channel), so an unusable tool never marks the tests as weak:
 * the result keeps `weakTests: false` and explains the skip in `unavailable`.
 *
 * @param dir          Directory handed to the runner.
 * @param changedFiles Files handed to the runner.
 * @param runner       Injected probe/run pair.
 * @param threshold    Minimum acceptable score; defaults to DEFAULT_MUTATION_THRESHOLD.
 */
export async function runMutationGate(dir, changedFiles, runner, threshold = DEFAULT_MUTATION_THRESHOLD) {
    const { ok, reason } = await runner.probe();
    if (ok) {
        const counts = await runner.run(dir, changedFiles);
        return gateMutation(counts, threshold);
    }
    const detail = reason ? `:${reason}` : "";
    return {
        weakTests: false,
        score: 0,
        threshold,
        unavailable: `mutation:unavailable${detail}`,
    };
}
@@ -0,0 +1,16 @@
1
+ import { type ChainExport } from "../../security/hash-audit-chain.js";
2
+ import type { ReproductionResult, ReproductionVerdict, ClaimedChecks, ObservedChecks } from "./verdict.js";
3
+ /**
4
+ * The verdict is a first-class, hash-chained, harness-signed PROOF ARTIFACT —
5
+ * not a bare boolean. `chainExport` is tamper-evident (SHA-256 linked); any
6
+ * post-hoc edit to the recorded verdict/observed breaks `HashAuditChain.verify()`.
7
+ */
8
+ export interface ProofArtifact {
9
+ readonly verdict: ReproductionVerdict; // copied from the ReproductionResult
10
+ readonly diffCid: string; // cidOf(diffText), as computed by buildProofArtifact
11
+ readonly claimed: ClaimedChecks; // copied from the ReproductionResult
12
+ readonly observed: ObservedChecks; // copied from the ReproductionResult
13
+ readonly contradictions: readonly string[]; // copied from the ReproductionResult
14
+ readonly chainExport: ChainExport; // exportChain() of a fresh HashAuditChain after one "reproduction.verdict" append
15
+ }
16
+ export declare function buildProofArtifact(result: ReproductionResult, diffText: string, actor?: string): ProofArtifact; // actor defaults to "wotann-verifier" and is passed to HashAuditChain.append; it is not a field of the returned artifact
@@ -0,0 +1,22 @@
1
+ import { HashAuditChain } from "../../security/hash-audit-chain.js";
2
+ import { cidOf } from "../../core/content-cid.js";
3
/**
 * Wrap a reproduction result in a proof artifact.
 *
 * The verdict, the diff's content id, the claimed/observed checks and the
 * contradictions are appended to a fresh HashAuditChain as one
 * "reproduction.verdict" entry (plus `infraError` when the result has one);
 * the same fields are returned alongside the exported chain.
 *
 * @param result   Outcome of decideReproductionVerdict.
 * @param diffText Diff text; only its content id (cidOf) is recorded.
 * @param actor    Actor passed to the chain entry; defaults to "wotann-verifier".
 */
export function buildProofArtifact(result, diffText, actor = "wotann-verifier") {
    const { verdict, claimed, observed, contradictions, infraError } = result;
    const diffCid = cidOf(diffText);
    const record = { verdict, diffCid, claimed, observed, contradictions };
    const chain = new HashAuditChain();
    // The chain always receives its own payload object; infraError is only added when set.
    const payload = infraError ? { ...record, infraError } : { ...record };
    chain.append("reproduction.verdict", actor, payload);
    return { ...record, chainExport: chain.exportChain() };
}
@@ -0,0 +1,37 @@
1
+ import type { ObservedChecks } from "./verdict.js";
2
+ /**
3
+ * Structural runner (mirrors VmRunner in computer-use/vm-isolation.ts). Tests
4
+ * inject an in-memory stub; production wires this to run argv INSIDE the
5
+ * verifier box (container/sandbox = the separate trust boundary). Argv-only,
6
+ * never a shell string — no interpolation.
7
+ */
8
+ export interface ReplayRunner {
9
+ readonly probe: () => Promise<{ // awaited by runReplay before any command runs
10
+ ok: boolean; // false => no command runs; runReplay returns all-null checks plus infraError
11
+ reason?: string; // appended as `replay:unavailable:<reason>` when set
12
+ }>;
13
+ readonly runInDir: (dir: string, argv: readonly string[]) => Promise<{ // runReplay passes ReplayInput.checkoutDir as dir
14
+ exitCode: number; // 0 = pass, anything else = fail
15
+ stdout: string; // not read by runReplay
16
+ stderr: string; // not read by runReplay
17
+ }>;
18
+ }
19
+ export interface ReplayCommands { // argv per check; an omitted or empty argv is not run and is observed as null
20
+ readonly test?: readonly string[]; // result becomes ObservedChecks.testsPass
21
+ readonly typecheck?: readonly string[]; // result becomes ObservedChecks.typecheckPass
22
+ readonly lint?: readonly string[]; // result becomes ObservedChecks.lintPass
23
+ }
24
+ export interface ReplayInput {
25
+ /** The clean, applied-diff checkout in the verifier box (separate trust boundary). */
26
+ readonly checkoutDir: string; // passed as `dir` to every ReplayRunner.runInDir call
27
+ readonly commands: ReplayCommands; // run one at a time, in the order test, typecheck, lint
28
+ }
29
+ /**
30
+ * Re-run the claimed commands in the verifier box and report the HARNESS's own
31
+ * pass/fail (exit code 0 = pass). NEVER reads the agent's self-reported result.
32
+ * Honest stub: a failed probe yields all-null observed + infraError, so the
33
+ * verdict logic emits `infra-error` and never auto-passes.
34
+ */
35
+ export declare function runReplay(input: ReplayInput, runner: ReplayRunner): Promise<ObservedChecks & {
36
+ readonly infraError?: string; // set only when the probe fails: `replay:unavailable[:<reason>]`
37
+ }>;
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Re-run the claimed commands in the verifier box and report the HARNESS's own
3
+ * pass/fail (exit code 0 = pass). NEVER reads the agent's self-reported result.
4
+ * Honest stub: a failed probe yields all-null observed + infraError, so the
5
+ * verdict logic emits `infra-error` and never auto-passes.
6
+ */
7
+ export async function runReplay(input, runner) {
8
+ const probe = await runner.probe();
9
+ if (!probe.ok) {
10
+ return {
11
+ testsPass: null,
12
+ typecheckPass: null,
13
+ lintPass: null,
14
+ infraError: `replay:unavailable${probe.reason ? `:${probe.reason}` : ""}`,
15
+ };
16
+ }
17
+ const observe = async (argv) => {
18
+ if (!argv || argv.length === 0)
19
+ return null;
20
+ const r = await runner.runInDir(input.checkoutDir, argv);
21
+ return r.exitCode === 0;
22
+ };
23
+ return {
24
+ testsPass: await observe(input.commands.test),
25
+ typecheckPass: await observe(input.commands.typecheck),
26
+ lintPass: await observe(input.commands.lint),
27
+ };
28
+ }
@@ -0,0 +1,34 @@
1
+ import { type TestAuthorship } from "./diff-checker.js";
2
+ import { type ReplayRunner, type ReplayCommands } from "./replay-runner.js";
3
+ import { type MutationRunner } from "./mutation-gate.js";
4
+ import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
5
+ import { type ProofArtifact } from "./proof-artifact.js";
6
+ export interface ReproduceInput {
7
+ readonly claimed: ClaimedChecks; // handed unchanged to decideReproductionVerdict
8
+ /** Repo-relative POSIX paths the agent's diff touched. */
9
+ readonly changedPaths: readonly string[]; // first argument of checkDiff (tamper check)
10
+ /** The clean, applied-diff checkout in the verifier box (separate trust boundary). */
11
+ readonly checkoutDir: string; // used for both the replay and the mutation gate
12
+ readonly commands: ReplayCommands; // forwarded to runReplay
13
+ /** The diff text (for the proof artifact's content identity). */
14
+ readonly diffText: string; // passed to buildProofArtifact
15
+ readonly authorship?: TestAuthorship; // forwarded as checkDiff's third argument
16
+ /** Changed source files to mutation-test (Phase-B bonus gate). */
17
+ readonly mutationFiles?: readonly string[]; // reproduceRun substitutes [] when omitted
18
+ }
19
+ export interface ReproduceRunners {
20
+ readonly replay: ReplayRunner; // always used, via runReplay
21
+ /** Optional — when present and the reproduction is clean, gates weak-tests. */
22
+ readonly mutation?: MutationRunner; // not invoked when the diff is tampered or the replay reported infraError
23
+ }
24
+ export interface ReproduceOutput {
25
+ readonly result: ReproductionResult; // returned by decideReproductionVerdict
26
+ readonly proof: ProofArtifact; // buildProofArtifact(result, diffText) with the default actor
27
+ }
28
+ /**
29
+ * Compose the reproduction library into one call: tamper-check the diff, replay
30
+ * the claimed commands in the verifier box (capturing the harness's OWN result),
31
+ * optionally mutation-gate a clean reproduction, decide the verdict, and emit
32
+ * the hash-chained proof artifact. Pure orchestration over injected runners.
33
+ */
34
+ export declare function reproduceRun(input: ReproduceInput, runners: ReproduceRunners): Promise<ReproduceOutput>; // a mutation gate that reports `unavailable` leaves weakTests false
@@ -0,0 +1,31 @@
1
+ import { checkDiff } from "./diff-checker.js";
2
+ import { runReplay } from "./replay-runner.js";
3
+ import { runMutationGate } from "./mutation-gate.js";
4
+ import { decideReproductionVerdict, } from "./verdict.js";
5
+ import { buildProofArtifact } from "./proof-artifact.js";
6
/**
 * One-call reproduction pipeline over injected runners:
 *  1. tamper-check the changed paths (checkDiff);
 *  2. replay the claimed commands and capture the harness's own results;
 *  3. if a mutation runner was supplied and the run is neither tampered nor an
 *     infra error, let the mutation gate flag weak tests;
 *  4. decide the verdict and wrap it in a hash-chained proof artifact.
 *
 * @param input   Claimed checks, changed paths, checkout dir, commands, diff text.
 * @param runners The replay runner and an optional mutation runner.
 * @returns The verdict result together with its proof artifact.
 */
export async function reproduceRun(input, runners) {
    const { tampered } = checkDiff(input.changedPaths, undefined, input.authorship);
    const observed = await runReplay({ checkoutDir: input.checkoutDir, commands: input.commands }, runners.replay);
    const { infraError } = observed;
    // The mutation gate is only a bonus downgrade; it adds nothing once the run
    // is already tampered or could not be replayed.
    const gateApplies = Boolean(runners.mutation) && !tampered && !infraError;
    const weakTests = gateApplies
        ? (await runMutationGate(input.checkoutDir, input.mutationFiles ?? [], runners.mutation)).weakTests
        : false;
    const result = decideReproductionVerdict({
        claimed: input.claimed,
        observed,
        tampered,
        weakTests,
        infraError,
    });
    return { result, proof: buildProofArtifact(result, input.diffText) };
}
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Reproduction verdict — the harness's OWN judgment, produced by re-running
3
+ * the claimed work in a separate trust boundary. NEVER the agent's self-report
4
+ * (that is the V7 violation this whole module exists to defeat).
5
+ */
6
+ export type ReproductionVerdict = "reproduced" | "contradicted" | "weak-tests" | "tamper" | "infra-error"; // when several apply: tamper > infra-error > contradicted > weak-tests > reproduced
7
+ export interface ClaimedChecks { // only a claim of `true` can be contradicted by an observation
8
+ readonly testsPass: boolean; // compared against ObservedChecks.testsPass
9
+ readonly typecheckPass: boolean; // compared against ObservedChecks.typecheckPass
10
+ readonly lintPass: boolean; // compared against ObservedChecks.lintPass
11
+ }
12
+ /** `null` = the check was not run / unavailable, so it cannot contradict a claim. */
13
+ export interface ObservedChecks { // an observed `false` against a claimed `true` is recorded as a contradiction
14
+ readonly testsPass: boolean | null; // reported under the name "tests" in contradictions
15
+ readonly typecheckPass: boolean | null; // reported under the name "typecheck"
16
+ readonly lintPass: boolean | null; // reported under the name "lint"
17
+ }
18
+ export interface ReproductionInput {
19
+ readonly claimed: ClaimedChecks; // echoed into the result
20
+ readonly observed: ObservedChecks; // echoed into the result
21
+ readonly tampered: boolean; // highest precedence: true always yields the "tamper" verdict
22
+ /** Set by the Phase-B mutation gate; Phase A never sets it. */
23
+ readonly weakTests?: boolean; // yields "weak-tests" only when not tampered, no infraError and no contradictions
24
+ /** Set when the replay could not run at all. */
25
+ readonly infraError?: string; // a non-empty string yields "infra-error" unless tampered
26
+ }
27
+ export interface ReproductionResult {
28
+ readonly verdict: ReproductionVerdict; // chosen by precedence: tamper > infra-error > contradicted > weak-tests > reproduced
29
+ readonly claimed: ClaimedChecks; // same object that was passed in
30
+ readonly observed: ObservedChecks; // same object that was passed in
31
+ readonly contradictions: readonly string[]; // one "<check>: agent claimed pass, harness observed fail" per contradicted check, collected whatever the verdict
32
+ readonly infraError?: string; // copied from the input when non-empty; otherwise the key is absent
33
+ }
34
+ /**
35
+ * Pure decision. Precedence (a false PASS is the moat-killer; a false BLOCK is
36
+ * recoverable, so we bias toward blocking): tamper > infra-error > contradicted
37
+ * > weak-tests > reproduced.
38
+ */
39
+ export declare function decideReproductionVerdict(input: ReproductionInput): ReproductionResult; // a claimed pass whose observation is null (not run) is not a contradiction
@@ -0,0 +1,40 @@
1
// The three checks compared between the agent's claim and the harness's replay.
const CHECKS = [
    { name: "tests", claimedKey: "testsPass", observedKey: "testsPass" },
    { name: "typecheck", claimedKey: "typecheckPass", observedKey: "typecheckPass" },
    { name: "lint", claimedKey: "lintPass", observedKey: "lintPass" },
];
/**
 * Pure verdict decision.
 *
 * A contradiction is recorded for every check the agent claimed as passing
 * (`true`) that the harness observed as failing (`false`); an observation of
 * `null` means "not run" and contradicts nothing.
 *
 * The verdict is the first signal that applies, in this order, biased toward
 * blocking because a false pass is worse than a false block:
 * tamper > infra-error > contradicted > weak-tests > reproduced.
 *
 * @param input Claimed and observed checks plus the tamper / weak-tests / infra signals.
 * @returns The verdict, the echoed claimed/observed checks, the contradictions,
 *          and `infraError` when the input carried one.
 */
export function decideReproductionVerdict(input) {
    const { claimed, observed, tampered, weakTests, infraError } = input;
    const contradictions = CHECKS
        .filter((check) => claimed[check.claimedKey] === true && observed[check.observedKey] === false)
        .map((check) => `${check.name}: agent claimed pass, harness observed fail`);
    // Ordered from highest to lowest precedence; the first active signal wins.
    const ladder = [
        [tampered, "tamper"],
        [infraError, "infra-error"],
        [contradictions.length > 0, "contradicted"],
        [weakTests, "weak-tests"],
    ];
    const winner = ladder.find(([active]) => Boolean(active));
    const verdict = winner ? winner[1] : "reproduced";
    const base = { verdict, claimed, observed, contradictions };
    return infraError ? { ...base, infraError } : base;
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wotann",
3
- "version": "0.5.96",
3
+ "version": "0.5.97",
4
4
  "description": "WOTANN — The All-Father of AI Agent Harnesses",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",