wotann 0.5.95 → 0.5.97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/dist/index.js +68 -24
  2. package/dist/orchestration/proof-bundles.d.ts +8 -0
  3. package/dist/orchestration/proof-bundles.js +2 -0
  4. package/dist/security/approval-binding.d.ts +52 -0
  5. package/dist/security/approval-binding.js +57 -0
  6. package/dist/security/human-approval.d.ts +2 -0
  7. package/dist/security/human-approval.js +15 -24
  8. package/dist/ui/components/v3/AppV3.d.ts +10 -1
  9. package/dist/ui/components/v3/AppV3.js +34 -5
  10. package/dist/ui/components/v3/Transcript.d.ts +21 -1
  11. package/dist/ui/components/v3/Transcript.js +18 -58
  12. package/dist/ui/components/v3/TranscriptRow.d.ts +45 -0
  13. package/dist/ui/components/v3/TranscriptRow.js +102 -0
  14. package/dist/ui/inline-render.d.ts +28 -0
  15. package/dist/ui/inline-render.js +35 -0
  16. package/dist/verification/reproduction/autonomous-gate.d.ts +52 -0
  17. package/dist/verification/reproduction/autonomous-gate.js +71 -0
  18. package/dist/verification/reproduction/checkout-prep.d.ts +48 -0
  19. package/dist/verification/reproduction/checkout-prep.js +78 -0
  20. package/dist/verification/reproduction/diff-checker.d.ts +26 -0
  21. package/dist/verification/reproduction/diff-checker.js +33 -0
  22. package/dist/verification/reproduction/enforcement.d.ts +14 -0
  23. package/dist/verification/reproduction/enforcement.js +30 -0
  24. package/dist/verification/reproduction/exec-runner.d.ts +15 -0
  25. package/dist/verification/reproduction/exec-runner.js +47 -0
  26. package/dist/verification/reproduction/index.d.ts +10 -0
  27. package/dist/verification/reproduction/index.js +10 -0
  28. package/dist/verification/reproduction/mutation-gate.d.ts +42 -0
  29. package/dist/verification/reproduction/mutation-gate.js +43 -0
  30. package/dist/verification/reproduction/proof-artifact.d.ts +16 -0
  31. package/dist/verification/reproduction/proof-artifact.js +22 -0
  32. package/dist/verification/reproduction/replay-runner.d.ts +37 -0
  33. package/dist/verification/reproduction/replay-runner.js +28 -0
  34. package/dist/verification/reproduction/reproduce.d.ts +34 -0
  35. package/dist/verification/reproduction/reproduce.js +31 -0
  36. package/dist/verification/reproduction/verdict.d.ts +39 -0
  37. package/dist/verification/reproduction/verdict.js +40 -0
  38. package/package.json +1 -1
  39. package/dist/ui/opentui-chat.d.ts +0 -19
  40. package/dist/ui/opentui-chat.js +0 -285
@@ -0,0 +1,45 @@
1
+ /**
2
+ * TranscriptRow — a single memoized conversation row.
3
+ *
4
+ * Extracted from `Transcript.tsx`'s inline row map so it can be rendered
5
+ * in BOTH places the AppV4 inline model needs it:
6
+ * 1. Committed history inside Ink `<Static>` (write-once → terminal
7
+ * scrollback). Static never re-renders an emitted row, so the memo
8
+ * is moot there — but identity-stable rows keep the contract clean.
9
+ * 2. The live in-flight turn (the streaming assistant row + its tool
10
+ * rows). Here the memo earns its keep: while the assistant row's
11
+ * content grows token-by-token, the sibling rows (the user prompt,
12
+ * finished tool rows) MUST NOT re-render.
13
+ *
14
+ * Why a CUSTOM comparator (not the default shallow `React.memo`):
15
+ * `toTranscriptMessages` rebuilds a fresh object for every message on
16
+ * every render (`messages.map((m) => ({...}))`), so a by-reference memo
17
+ * would re-render every row on every streamed token — defeating the
18
+ * point. We compare the fields that actually drive the render. The
19
+ * `attachments` array survives a reference check because
20
+ * `toTranscriptMessages` threads the SAME array through (it never copies
21
+ * it), so an unchanged message keeps an identity-stable attachments ref.
22
+ *
23
+ * Rendering is byte-identical to the prior inline map (Phase 1 is a pure
24
+ * no-op refactor): the horizontal padding stays a concern of the
25
+ * container/Static call-site, never the row, so this component composes
26
+ * the same in either mounting context.
27
+ */
28
+ import React from "react";
29
+ import type { CapabilityProfile } from "../../capability-tier.js";
30
+ import type { TerminalCapabilities } from "../../terminal-capabilities.js";
31
+ import type { TranscriptMessageV3 } from "./Transcript.js";
32
+ export interface TranscriptRowProps {
33
+ readonly msg: TranscriptMessageV3;
34
+ readonly profile: CapabilityProfile;
35
+ readonly terminalCapabilities: TerminalCapabilities;
36
+ }
37
+ declare function TranscriptRowImpl({ msg, profile, terminalCapabilities, }: TranscriptRowProps): React.ReactElement;
38
+ /**
39
+ * Memo comparator — return TRUE to SKIP a re-render. Exported for direct
40
+ * unit testing (the streaming hot-path depends on this returning true for
41
+ * an unchanged sibling row while the assistant row streams).
42
+ */
43
+ export declare function transcriptRowsEqual(prev: TranscriptRowProps, next: TranscriptRowProps): boolean;
44
+ export declare const TranscriptRow: React.MemoExoticComponent<typeof TranscriptRowImpl>;
45
+ export {};
@@ -0,0 +1,102 @@
1
+ import { jsx as _jsx, jsxs as _jsxs } from "#wotann-jsx/jsx-runtime";
2
+ /**
3
+ * TranscriptRow — a single memoized conversation row.
4
+ *
5
+ * Extracted from `Transcript.tsx`'s inline row map so it can be rendered
6
+ * in BOTH places the AppV4 inline model needs it:
7
+ * 1. Committed history inside Ink `<Static>` (write-once → terminal
8
+ * scrollback). Static never re-renders an emitted row, so the memo
9
+ * is moot there — but identity-stable rows keep the contract clean.
10
+ * 2. The live in-flight turn (the streaming assistant row + its tool
11
+ * rows). Here the memo earns its keep: while the assistant row's
12
+ * content grows token-by-token, the sibling rows (the user prompt,
13
+ * finished tool rows) MUST NOT re-render.
14
+ *
15
+ * Why a CUSTOM comparator (not the default shallow `React.memo`):
16
+ * `toTranscriptMessages` rebuilds a fresh object for every message on
17
+ * every render (`messages.map((m) => ({...}))`), so a by-reference memo
18
+ * would re-render every row on every streamed token — defeating the
19
+ * point. We compare the fields that actually drive the render. The
20
+ * `attachments` array survives a reference check because
21
+ * `toTranscriptMessages` threads the SAME array through (it never copies
22
+ * it), so an unchanged message keeps an identity-stable attachments ref.
23
+ *
24
+ * Rendering is byte-identical to the prior inline map (Phase 1 is a pure
25
+ * no-op refactor): the horizontal padding stays a concern of the
26
+ * container/Static call-site, never the row, so this component composes
27
+ * the same in either mounting context.
28
+ */
29
+ import React from "react";
30
+ import { Box, Text } from "ink";
31
+ import { glyph } from "../../theme/tokens.js";
32
+ import { useThemeTone } from "../../theme/context.js";
33
+ import { parseSlashResultMessage, SystemMessageCard } from "./SystemMessageCard.js";
34
+ import { KittyGraphics } from "./KittyGraphics.js";
35
+ // Minimal role markers — Claude Code / Codex parity. User gets a
36
+ // single subtle ❯ (same glyph as the composer prompt); the assistant
37
+ // is PURE content (no badge, no label, no gutter) — the biggest
38
+ // declutter; system/tool get a dim · marker. No per-message gutter
39
+ // bar, no Norse runes, nothing bold.
40
+ const ROLE_STYLES_AB = {
41
+ user: { gutterTone: "primary", badge: glyph.prompt, label: "" },
42
+ assistant: { gutterTone: "muted", badge: "", label: "" },
43
+ system: { gutterTone: "warning", badge: "·", label: "system" },
44
+ tool: { gutterTone: "muted", badge: "·", label: "tool" },
45
+ };
46
+ const ROLE_STYLES_C = {
47
+ user: { gutterTone: "primary", badge: ">", label: "" },
48
+ assistant: { gutterTone: "muted", badge: "", label: "" },
49
+ system: { gutterTone: "warning", badge: "*", label: "system" },
50
+ tool: { gutterTone: "muted", badge: "*", label: "tool" },
51
+ };
52
/**
 * Format an epoch-millisecond timestamp as a zero-padded, local-time `HH:MM`
 * label for the row header.
 *
 * @param timestamp Candidate timestamp; anything other than a finite number
 *   is treated as "no timestamp".
 * @returns The `HH:MM` label, or `null` when no valid time can be derived.
 */
function formatTime(timestamp) {
    if (typeof timestamp !== "number" || !Number.isFinite(timestamp))
        return null;
    const d = new Date(timestamp);
    // A finite number can still fall outside the representable Date range
    // (|ms| > 8.64e15). That yields an Invalid Date whose getters return NaN,
    // which would otherwise render as the label "NaN:NaN".
    if (Number.isNaN(d.getTime()))
        return null;
    const hh = String(d.getHours()).padStart(2, "0");
    const mm = String(d.getMinutes()).padStart(2, "0");
    return `${hh}:${mm}`;
}
60
/**
 * Render one transcript row: an optional header (role badge, role label,
 * timestamp), one `Text` per content line, then any attachments.
 *
 * System messages whose content parses as a slash-command result are handed
 * to `SystemMessageCard` instead of the default layout.
 */
function TranscriptRowImpl({ msg, profile, terminalCapabilities, }) {
    const { tone } = useThemeTone();
    const isTierC = profile.tier === "C";
    const styles = isTierC ? ROLE_STYLES_C : ROLE_STYLES_AB;
    // Marker-based handshake: a system message without the slash-result
    // marker parses to null and falls through to the default render below.
    if (msg.role === "system") {
        const payload = parseSlashResultMessage(msg.content);
        if (payload !== null) {
            return _jsx(SystemMessageCard, { payload: payload, profile: profile });
        }
    }
    const style = styles[msg.role];
    const gutterColor = tone[style.gutterTone];
    const timeString = formatTime(msg.timestamp);
    const lines = msg.content.length === 0 ? [""] : msg.content.split("\n");
    const attachments = msg.attachments ?? [];
    const showBadge = style.badge !== "";
    const showLabel = style.label !== "";
    const showTime = timeString !== null;
    // The header only exists when at least one of its three parts has content.
    const headerNode = (showBadge || showLabel || showTime) &&
        _jsxs(Box, {
            justifyContent: "space-between",
            children: [
                _jsxs(Box, {
                    gap: 1,
                    children: [
                        showBadge && _jsx(Text, { color: gutterColor, children: style.badge }),
                        showLabel && _jsx(Text, { color: gutterColor, children: style.label }),
                    ],
                }),
                showTime && _jsx(Text, { color: tone.muted, children: timeString }),
            ],
        });
    // An empty line is rendered as a single space rather than an empty Text.
    const lineNodes = lines.map((line, i) => _jsx(Text, { color: tone.text, children: line.length === 0 ? " " : line }, `msg-${msg.id}-line-${i}`));
    const attachmentNodes = attachments.map((attachment, i) => {
        const key = `msg-${msg.id}-att-${i}`;
        if (attachment.kind === "image" && attachment.dataUri !== undefined) {
            return _jsx(KittyGraphics, {
                source: attachment.dataUri,
                capabilities: terminalCapabilities,
                rows: isTierC ? 3 : 5,
                columns: isTierC ? 28 : 40,
                caption: attachment.path,
            }, key);
        }
        // Images without inline data and every non-image attachment get a
        // dim bracketed text placeholder.
        const label = attachment.kind === "image" ? "image" : "file";
        return _jsxs(Text, { color: tone.muted, italic: true, children: ["[", label, ": ", attachment.path, "]"] }, key);
    });
    return _jsxs(Box, {
        flexDirection: "column",
        marginBottom: 1,
        children: [headerNode, lineNodes, attachmentNodes],
    });
}
88
/**
 * `React.memo` comparator for a transcript row: returning TRUE tells React
 * the props are equivalent and the re-render can be SKIPPED. Exported so it
 * can be unit-tested directly.
 *
 * The message is compared field by field with strict equality (so
 * `attachments` is an identity check, not a deep compare), followed by an
 * identity check on `profile` and `terminalCapabilities`.
 */
export function transcriptRowsEqual(prev, next) {
    const comparedMessageFields = ["id", "content", "role", "timestamp", "attachments"];
    for (const field of comparedMessageFields) {
        if (prev.msg[field] !== next.msg[field])
            return false;
    }
    if (prev.profile !== next.profile)
        return false;
    return prev.terminalCapabilities === next.terminalCapabilities;
}
102
+ export const TranscriptRow = React.memo(TranscriptRowImpl, transcriptRowsEqual);
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Inline-render mode gate (AppV4 flicker fix, phased rollout).
3
+ *
4
+ * The complete streaming-flicker fix renders the chat surface INLINE in
5
+ * the terminal's MAIN buffer (committed history written once into native
6
+ * scrollback via Ink `<Static>`) instead of the alternate screen buffer.
7
+ * The alt buffer has no scrollback, so `<Static>` is incompatible with it
8
+ * (committed rows would scroll off irrecoverably) — see
9
+ * docs/phase-0-redesign/phase-1-inline-architecture.md §0.
10
+ *
11
+ * Why a flag (not default-on yet): the mount-path change can only be
12
+ * validated for "feel" + cross-terminal correctness (iTerm2 / Terminal.app
13
+ * / tmux) on a PHYSICAL terminal, which CI/headless harnesses cannot
14
+ * reproduce (this is the same fragility that drove the PR #35-#38
15
+ * raw-mode/mount hardening). So inline mode ships behind
16
+ * `WOTANN_TUI_INLINE=1` for opt-in verification; once confirmed on a real
17
+ * terminal it becomes the default for the chat surface.
18
+ *
19
+ * The objective gate (eraseLines-sequence drop ≥90% vs the alt-buffer
20
+ * baseline) is exercised by the PTY byte-trace test, which DOES run here.
21
+ */
22
+ /**
23
+ * Whether this session should render the chat surface inline in the main
24
+ * buffer (Ink `<Static>` committed history) rather than in the alt-screen
25
+ * buffer. Reads `WOTANN_TUI_INLINE`; defaults OFF during the phased
26
+ * rollout. Pass an explicit env for tests.
27
+ */
28
+ export declare function isInlineRenderRequested(env?: NodeJS.ProcessEnv): boolean;
@@ -0,0 +1,35 @@
1
+ /**
2
+ * Inline-render mode gate (AppV4 flicker fix, phased rollout).
3
+ *
4
+ * The complete streaming-flicker fix renders the chat surface INLINE in
5
+ * the terminal's MAIN buffer (committed history written once into native
6
+ * scrollback via Ink `<Static>`) instead of the alternate screen buffer.
7
+ * The alt buffer has no scrollback, so `<Static>` is incompatible with it
8
+ * (committed rows would scroll off irrecoverably) — see
9
+ * docs/phase-0-redesign/phase-1-inline-architecture.md §0.
10
+ *
11
+ * Why a flag (not default-on yet): the mount-path change can only be
12
+ * validated for "feel" + cross-terminal correctness (iTerm2 / Terminal.app
13
+ * / tmux) on a PHYSICAL terminal, which CI/headless harnesses cannot
14
+ * reproduce (this is the same fragility that drove the PR #35-#38
15
+ * raw-mode/mount hardening). So inline mode ships behind
16
+ * `WOTANN_TUI_INLINE=1` for opt-in verification; once confirmed on a real
17
+ * terminal it becomes the default for the chat surface.
18
+ *
19
+ * The objective gate (eraseLines-sequence drop ≥90% vs the alt-buffer
20
+ * baseline) is exercised by the PTY byte-trace test, which DOES run here.
21
+ */
22
/** Values (matched after trimming and lower-casing) that opt a session into inline rendering. */
const TRUTHY = new Set(["1", "true", "yes", "on"]);
/**
 * Decide whether the chat surface should render inline in the main buffer
 * instead of the alt-screen buffer.
 *
 * Reads `WOTANN_TUI_INLINE` from `env`. An unset variable, or any value
 * outside the truthy set, means OFF.
 *
 * @param env Environment map to consult; defaults to `process.env`. Pass an
 *   explicit object in tests.
 * @returns `true` only when the variable holds one of the truthy values.
 */
export function isInlineRenderRequested(env = process.env) {
    const raw = env["WOTANN_TUI_INLINE"];
    return raw === undefined ? false : TRUTHY.has(raw.trim().toLowerCase());
}
@@ -0,0 +1,52 @@
1
+ import { type GitRunner } from "./checkout-prep.js";
2
+ import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
3
+ import { type ProofArtifact } from "./proof-artifact.js";
4
+ import { type EnforcementDecision } from "./enforcement.js";
5
+ import type { ReplayRunner, ReplayCommands } from "./replay-runner.js";
6
+ import type { MutationRunner } from "./mutation-gate.js";
7
+ export interface AutonomousReproduceInput {
8
+ readonly repoDir: string;
9
+ readonly baseRef: string;
10
+ readonly diffText: string;
11
+ readonly changedPaths: readonly string[];
12
+ readonly claimed: ClaimedChecks;
13
+ readonly commands: ReplayCommands;
14
+ /** A fresh, non-existent directory for the verifier-box worktree. */
15
+ readonly worktreeDir: string;
16
+ /** Paths to symlink from repoDir into the worktree (e.g. ["node_modules"]). */
17
+ readonly linkFromRepo?: readonly string[];
18
+ }
19
+ export interface AutonomousReproduceRunners {
20
+ readonly git: GitRunner;
21
+ readonly replay: ReplayRunner;
22
+ readonly mutation?: MutationRunner;
23
+ }
24
+ export interface AutonomousReproduceOutput {
25
+ readonly result: ReproductionResult;
26
+ readonly proof: ProofArtifact;
27
+ readonly enforcement: EnforcementDecision;
28
+ }
29
+ /**
30
+ * Full production composition: prepare an isolated checkout (the separate trust
31
+ * boundary), reproduce the claimed result inside it, decide enforcement, and
32
+ * ALWAYS clean up the worktree. A failed checkout => infra-error (never an
33
+ * auto-pass — we couldn't verify, so we don't silently allow).
34
+ */
35
+ export declare function reproduceAutonomousRun(input: AutonomousReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
36
+ export interface WorkspaceReproduceInput {
37
+ /** The agent's workspace (its uncommitted changes are captured as the diff). */
38
+ readonly cwd: string;
39
+ readonly claimed: ClaimedChecks;
40
+ readonly commands: ReplayCommands;
41
+ /** A fresh, non-existent directory for the verifier-box worktree. */
42
+ readonly worktreeDir: string;
43
+ /** Paths to symlink from the workspace into the worktree (e.g. ["node_modules"]). */
44
+ readonly linkFromRepo?: readonly string[];
45
+ }
46
+ /**
47
+ * Capture the workspace's git state (HEAD as base + the uncommitted diff) and
48
+ * reproduce. Assumes the agent's changes are UNCOMMITTED (the common autonomous
49
+ * case): HEAD is the base and `git diff` is the agent's work. If the agent
50
+ * committed mid-run, the diff is empty and the clean base is reproduced.
51
+ */
52
+ export declare function runWorkspaceReproduction(input: WorkspaceReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
@@ -0,0 +1,71 @@
1
+ import { prepareVerifierCheckout } from "./checkout-prep.js";
2
+ import { reproduceRun } from "./reproduce.js";
3
+ import { decideReproductionVerdict, } from "./verdict.js";
4
+ import { buildProofArtifact } from "./proof-artifact.js";
5
+ import { enforceReproductionVerdict } from "./enforcement.js";
6
/**
 * End-to-end reproduction of an autonomous run.
 *
 * Steps:
 *  1. Prepare a verifier checkout (base ref + the agent's diff) through
 *     `prepareVerifierCheckout`.
 *  2. If the checkout could not be prepared, produce a verdict with
 *     `infraError` set and all observed checks `null` — never an auto-pass.
 *  3. Otherwise run `reproduceRun` inside the checkout and derive the
 *     enforcement decision from the resulting verdict.
 *  4. Once a checkout exists, its `cleanup()` runs in a `finally`, whether
 *     reproduction returned or threw.
 *
 * @param input Repo location, base ref, diff, claimed checks, replay
 *   commands and worktree directory.
 * @param runners Git and replay runners; the mutation runner is optional.
 * @returns The reproduction result, its proof artifact and the enforcement
 *   decision.
 */
export async function reproduceAutonomousRun(input, runners) {
    const { claimed, diffText, changedPaths } = input;
    // Every exit path reports the same three-part shape.
    const toOutput = (result, proof) => ({
        result,
        proof,
        enforcement: enforceReproductionVerdict(result.verdict),
    });
    const baseRequest = {
        repoDir: input.repoDir,
        baseRef: input.baseRef,
        diffText,
        worktreeDir: input.worktreeDir,
    };
    // linkFromRepo is only forwarded when the caller actually supplied it.
    const checkoutRequest = input.linkFromRepo
        ? { ...baseRequest, linkFromRepo: input.linkFromRepo }
        : baseRequest;
    const checkout = await prepareVerifierCheckout(checkoutRequest, runners.git);
    if (!checkout.ok) {
        // Nothing could be observed, so every check stays null and the
        // failure is carried in infraError.
        const failed = decideReproductionVerdict({
            claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `checkout:${checkout.error}`,
        });
        return toOutput(failed, buildProofArtifact(failed, diffText));
    }
    const { checkoutDir, cleanup } = checkout.checkout;
    try {
        const reproRunners = runners.mutation
            ? { replay: runners.replay, mutation: runners.mutation }
            : { replay: runners.replay };
        const { result, proof } = await reproduceRun({
            claimed,
            changedPaths,
            checkoutDir,
            commands: input.commands,
            diffText,
            mutationFiles: changedPaths,
        }, reproRunners);
        return toOutput(result, proof);
    }
    finally {
        await cleanup();
    }
}
48
/**
 * Capture the workspace's git state (HEAD as base + the uncommitted diff) and
 * reproduce it through `reproduceAutonomousRun`.
 *
 * Assumes the agent's changes are UNCOMMITTED (the common autonomous case).
 * The diff is taken against HEAD (`git diff HEAD`), so both staged and
 * unstaged edits are captured and the patch is consistent with the base ref
 * it is applied to. If the agent committed mid-run, the diff is empty and the
 * clean base is reproduced.
 *
 * Any failing git capture step fails CLOSED: the function returns a verdict
 * with `infraError` set instead of continuing with an empty diff. Continuing
 * would reproduce the untouched base and could pass work that was never
 * verified.
 *
 * NOTE(review): `git diff` does not list untracked files, so files the agent
 * created but never `git add`ed are not part of the captured diff — confirm
 * whether callers stage new files before invoking this.
 *
 * @param input Workspace directory, claimed checks, replay commands and the
 *   worktree directory for the verifier checkout.
 * @param runners Git and replay runners (mutation runner optional).
 * @returns The reproduction result, proof artifact and enforcement decision.
 */
export async function runWorkspaceReproduction(input, runners) {
    // Build the same "could not verify" output shape the checkout-failure
    // path of reproduceAutonomousRun produces.
    const captureFailed = (stage, run) => {
        const result = decideReproductionVerdict({
            claimed: input.claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `workspace-capture:${stage}:exit=${run.exitCode}:${run.stderr.trim().slice(0, 240)}`,
        });
        const proof = buildProofArtifact(result, "");
        return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
    };
    const head = await runners.git.run(input.cwd, ["rev-parse", "HEAD"]);
    const baseRef = head.stdout.trim();
    if (head.exitCode !== 0 || baseRef === "")
        return captureFailed("rev-parse", head);
    const diff = await runners.git.run(input.cwd, ["diff", "HEAD"]);
    if (diff.exitCode !== 0)
        return captureFailed("diff", diff);
    const names = await runners.git.run(input.cwd, ["diff", "--name-only", "HEAD"]);
    if (names.exitCode !== 0)
        return captureFailed("diff-name-only", names);
    const changedPaths = names.stdout
        .split("\n")
        .map((s) => s.trim())
        .filter(Boolean);
    return reproduceAutonomousRun({
        repoDir: input.cwd,
        baseRef,
        diffText: diff.stdout,
        changedPaths,
        claimed: input.claimed,
        commands: input.commands,
        worktreeDir: input.worktreeDir,
        ...(input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {}),
    }, runners);
}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Prepare the separate-trust-boundary checkout for verify-by-reproduction
3
+ * (spec §3.1, the missing prerequisite for a correct production trigger).
4
+ *
5
+ * The verifier box MUST run a CLEAN base checkout + ONLY the agent's diff — in
6
+ * a separate directory — so the agent's dirty workspace (runtime tampering,
7
+ * stray files, monkeypatched runners) can never influence the result. Running
8
+ * the replay in the agent's own `process.cwd()` would be the BenchJack V1
9
+ * violation this defeats.
10
+ */
11
+ export interface GitRunner {
12
+ readonly run: (cwd: string, argv: readonly string[]) => Promise<{
13
+ exitCode: number;
14
+ stdout: string;
15
+ stderr: string;
16
+ }>;
17
+ }
18
+ /** Production GitRunner via execFile (argv-only, injection-safe). */
19
+ export declare function buildExecGitRunner(): GitRunner;
20
+ export interface PrepareCheckoutInput {
21
+ /** The agent's repository (the worktree is registered here). */
22
+ readonly repoDir: string;
23
+ /** The commit the agent started from (before its changes). */
24
+ readonly baseRef: string;
25
+ /** The agent's diff (unified, as from `git diff`). Empty = clean base. */
26
+ readonly diffText: string;
27
+ /** A fresh, non-existent directory for the detached worktree. */
28
+ readonly worktreeDir: string;
29
+ /**
30
+ * Paths (relative to repoDir) to symlink into the worktree after checkout —
31
+ * e.g. ["node_modules"]. A fresh worktree lacks gitignored deps, so without
32
+ * this the replay's `npm test` would fail on missing modules (a FALSE
33
+ * 'contradicted'). Deps are not the grading surface, so sharing them is safe.
34
+ */
35
+ readonly linkFromRepo?: readonly string[];
36
+ }
37
+ export interface VerifierCheckout {
38
+ readonly checkoutDir: string;
39
+ readonly cleanup: () => Promise<void>;
40
+ }
41
+ export type CheckoutResult = {
42
+ readonly ok: true;
43
+ readonly checkout: VerifierCheckout;
44
+ } | {
45
+ readonly ok: false;
46
+ readonly error: string;
47
+ };
48
+ export declare function prepareVerifierCheckout(input: PrepareCheckoutInput, git: GitRunner): Promise<CheckoutResult>;
@@ -0,0 +1,78 @@
1
+ import { writeFileSync, symlinkSync, existsSync } from "node:fs";
2
+ import { join } from "node:path";
3
/**
 * Production GitRunner via execFile (argv-only, injection-safe).
 *
 * The returned `run(cwd, argv)` never rejects; every outcome is reported as
 * `{ exitCode, stdout, stderr }`:
 *  - `exitCode` is the numeric `error.code` when there is one, `1` for any
 *    other error (e.g. a spawn failure whose code is a string such as
 *    "ENOENT"), and `0` on success.
 *  - `stderr` falls back to the error's message when the process produced no
 *    stderr text. execFile always passes a string (possibly empty), so an
 *    undefined-only fallback would never fire and spawn failures would be
 *    reported with an empty reason.
 */
export function buildExecGitRunner() {
    return {
        run: async (cwd, argv) => {
            const { execFile } = await import("node:child_process");
            return new Promise((resolve) => {
                execFile("git", [...argv], { cwd, maxBuffer: 64 * 1024 * 1024 }, (error, stdout, stderr) => {
                    const exitCode = error && typeof error.code === "number"
                        ? Number(error.code)
                        : error
                            ? 1
                            : 0;
                    const stderrText = stderr?.toString() ?? "";
                    const fallbackReason = error instanceof Error ? error.message : "";
                    resolve({
                        exitCode,
                        stdout: stdout?.toString() ?? "",
                        stderr: stderrText !== "" ? stderrText : fallbackReason,
                    });
                });
            });
        },
    };
}
25
/**
 * Create the verifier checkout: a detached git worktree at `baseRef` with
 * the agent's diff applied on top.
 *
 * Flow:
 *  1. `git worktree add --detach <worktreeDir> <baseRef>` run in `repoDir`.
 *  2. When `diffText` is non-blank, write it to a temporary patch file in the
 *     worktree, `git apply` it, then delete the patch file so the checkout
 *     holds only the base plus the diff's own changes.
 *  3. Symlink each `linkFromRepo` path (e.g. "node_modules") from the repo
 *     into the worktree, best-effort.
 *
 * Failures after the worktree exists (patch write, apply) remove the
 * worktree before returning.
 *
 * @param input Repo dir, base ref, diff text, target worktree dir and the
 *   optional paths to link.
 * @param git Runner used for every git invocation.
 * @returns `{ ok: true, checkout }` with the checkout dir and a `cleanup()`
 *   that force-removes the worktree, or `{ ok: false, error }` whose error is
 *   prefixed `worktree-add-failed:`, `patch-write-failed:` or `apply-failed:`.
 */
export async function prepareVerifierCheckout(input, git) {
    const add = await git.run(input.repoDir, [
        "worktree",
        "add",
        "--detach",
        input.worktreeDir,
        input.baseRef,
    ]);
    if (add.exitCode !== 0) {
        return {
            ok: false,
            error: `worktree-add-failed:exit=${add.exitCode}:${add.stderr.trim().slice(0, 240)}`,
        };
    }
    const cleanup = async () => {
        await git.run(input.repoDir, ["worktree", "remove", "--force", input.worktreeDir]);
    };
    if (input.diffText.trim() !== "") {
        const patchPath = join(input.worktreeDir, ".wotann-verify.patch");
        try {
            writeFileSync(patchPath, input.diffText, "utf-8");
        }
        catch (e) {
            await cleanup();
            return {
                ok: false,
                error: `patch-write-failed:${e instanceof Error ? e.message : String(e)}`,
            };
        }
        const apply = await git.run(input.worktreeDir, ["apply", "--whitespace=nowarn", patchPath]);
        // The patch file is scaffolding, not part of the agent's diff. Remove
        // it so it cannot show up as a stray untracked file during replay.
        try {
            const fsModule = await import("node:fs");
            fsModule.rmSync(patchPath, { force: true });
        }
        catch {
            // Best-effort: a leftover patch file is removed with the worktree
            // by cleanup() anyway.
        }
        if (apply.exitCode !== 0) {
            await cleanup();
            return {
                ok: false,
                error: `apply-failed:exit=${apply.exitCode}:${apply.stderr.trim().slice(0, 240)}`,
            };
        }
    }
    for (const rel of input.linkFromRepo ?? []) {
        const src = join(input.repoDir, rel);
        const dst = join(input.worktreeDir, rel);
        // Best-effort: skip when the source is absent or the target already exists.
        if (existsSync(src) && !existsSync(dst)) {
            try {
                symlinkSync(src, dst, "dir");
            }
            catch {
                // A failed symlink just means the replay may hit missing deps and
                // report a result the verdict logic handles — never a silent pass.
            }
        }
    }
    return { ok: true, checkout: { checkoutDir: input.worktreeDir, cleanup } };
}
@@ -0,0 +1,26 @@
1
+ /**
2
+ * Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
3
+ * must touch ONLY source — never the test harness / its own grader. A diff
4
+ * that edits the grading surface is TAMPER, not a normal failure.
5
+ *
6
+ * NOTE: package.json is deliberately NOT protected here (legit dep edits are
7
+ * common); detecting edits to its `scripts.test` field specifically is a
8
+ * Phase-B field-level refinement.
9
+ */
10
+ export declare const DEFAULT_PROTECTED_PATTERNS: readonly RegExp[];
11
+ export interface TestAuthorship {
12
+ /** Matches files considered "tests" (whose edits are gated). */
13
+ readonly testFilePattern: RegExp;
14
+ /** Test files the agent legitimately created/edited THIS task (allowlisted). */
15
+ readonly authoredTestFiles: readonly string[];
16
+ }
17
+ export interface DiffCheckResult {
18
+ readonly tampered: boolean;
19
+ readonly offendingPaths: readonly string[];
20
+ }
21
+ /**
22
+ * Pure. `changedPaths` are repo-relative POSIX paths the agent's diff touched.
23
+ * A path is offending if it matches a protected pattern, OR it is a test file
24
+ * (per `authorship.testFilePattern`) that is not in `authoredTestFiles`.
25
+ */
26
+ export declare function checkDiff(changedPaths: readonly string[], patterns?: readonly RegExp[], authorship?: TestAuthorship): DiffCheckResult;
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
3
+ * must touch ONLY source — never the test harness / its own grader. A diff
4
+ * that edits the grading surface is TAMPER, not a normal failure.
5
+ *
6
+ * NOTE: package.json is deliberately NOT protected here (legit dep edits are
7
+ * common); detecting edits to its `scripts.test` field specifically is a
8
+ * Phase-B field-level refinement.
9
+ */
10
/**
 * Paths that belong to the grading surface by default: pytest/tox
 * configuration, jest/vitest/playwright config files, `.mocharc.*`, and
 * anything at or under a `.git` directory. Each pattern matches at the repo
 * root or after any `/`.
 */
export const DEFAULT_PROTECTED_PATTERNS = Object.freeze([
    /(^|\/)conftest\.py$/,
    /(^|\/)pytest\.ini$/,
    /(^|\/)tox\.ini$/,
    /(^|\/)(jest|vitest|playwright)\.config\.[cm]?[jt]s$/,
    /(^|\/)\.mocharc\.[a-z]+$/,
    /(^|\/)\.git(\/|$)/,
]);
/**
 * Classify the paths touched by a diff.
 *
 * A path is offending if it matches a protected pattern, OR it is a test file
 * (per `authorship.testFilePattern`) that is not in `authoredTestFiles`.
 * Input order is preserved in `offendingPaths`.
 *
 * @param changedPaths Repo-relative POSIX paths the agent's diff touched.
 * @param patterns Protected-path patterns; defaults to
 *   `DEFAULT_PROTECTED_PATTERNS`.
 * @param authorship Optional test-file gate: the pattern that identifies
 *   test files plus the allowlist of test files the agent may edit.
 * @returns `{ tampered, offendingPaths }`, where `tampered` is true when at
 *   least one path is offending.
 */
export function checkDiff(changedPaths, patterns = DEFAULT_PROTECTED_PATTERNS, authorship) {
    const authored = new Set(authorship?.authoredTestFiles ?? []);
    // A regex carrying the `g` or `y` flag keeps `lastIndex` between `.test()`
    // calls, so reusing it across paths would start later matches mid-string
    // and let protected paths slip through. Always match from the start.
    const matches = (re, path) => {
        re.lastIndex = 0;
        return re.test(path);
    };
    const offendingPaths = changedPaths.filter((p) => {
        if (patterns.some((re) => matches(re, p)))
            return true;
        if (authorship && matches(authorship.testFilePattern, p) && !authored.has(p))
            return true;
        return false;
    });
    return { tampered: offendingPaths.length > 0, offendingPaths };
}
@@ -0,0 +1,14 @@
1
+ import type { ReproductionVerdict } from "./verdict.js";
2
+ export type EnforcementAction = "allow" | "block" | "surface" | "escalate";
3
+ export interface EnforcementDecision {
4
+ readonly action: EnforcementAction;
5
+ readonly reason: string;
6
+ }
7
+ /**
8
+ * Asymmetric enforcement (spec §3.2). A false PASS is the moat-killer; a false
9
+ * BLOCK is recoverable — so reproduction-sourced tamper/contradicted HARD-BLOCK,
10
+ * while weaker signals surface or escalate. Keyed on the REPRODUCTION verdict
11
+ * (trustworthy, executable) — never on bare LLM-judge text (the TNR<25% yes-man
12
+ * problem is exactly why the enforce-flip waited for the reproduction channel).
13
+ */
14
+ export declare function enforceReproductionVerdict(verdict: ReproductionVerdict): EnforcementDecision;
@@ -0,0 +1,30 @@
1
/**
 * Asymmetric enforcement (spec §3.2). A false PASS is the moat-killer; a false
 * BLOCK is recoverable — so reproduction-sourced tamper/contradicted HARD-BLOCK,
 * while weaker signals surface or escalate. Keyed on the REPRODUCTION verdict
 * (trustworthy, executable) — never on bare LLM-judge text.
 *
 * Mapping:
 *  - "tamper", "contradicted" -> block
 *  - "weak-tests"             -> surface
 *  - "infra-error"            -> escalate
 *  - "reproduced"             -> allow
 *  - anything else            -> escalate (fail closed)
 *
 * @param verdict The reproduction verdict to translate.
 * @returns The enforcement action with a human-readable reason.
 */
export function enforceReproductionVerdict(verdict) {
    switch (verdict) {
        case "tamper":
            return { action: "block", reason: "diff tampered with the grading surface" };
        case "contradicted":
            return {
                action: "block",
                reason: "claimed success contradicted by independent reproduction",
            };
        case "weak-tests":
            return {
                action: "surface",
                reason: "reproduction passed but tests are too weak to trust (low mutation score)",
            };
        case "infra-error":
            return {
                action: "escalate",
                reason: "could not reproduce — verify manually before trusting",
            };
        case "reproduced":
            return { action: "allow", reason: "independently reproduced in a separate trust boundary" };
        default:
            // The type layer is erased at runtime: an unexpected value used to
            // fall out of the switch and return undefined. Never allow on a
            // verdict this function does not understand.
            return {
                action: "escalate",
                reason: `unrecognized reproduction verdict: ${String(verdict)} — verify manually before trusting`,
            };
    }
}
@@ -0,0 +1,15 @@
1
+ import type { ReplayRunner } from "./replay-runner.js";
2
+ /**
3
+ * Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
4
+ * shell interpolation, so injection-safe regardless of the command content,
5
+ * matching the codebase's execFileNoThrow contract). Runs the claimed commands
6
+ * inside `dir`, the verifier-box checkout.
7
+ *
8
+ * Honest probe: confirms the Node interpreter is runnable; reports the failure
9
+ * explicitly otherwise (so runReplay emits `infra-error`, never a silent pass).
10
+ *
11
+ * Trust-boundary note: this runs on the HOST in a given directory. Full
12
+ * separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
13
+ * production hardening — this is the host-dir baseline so the loop runs for real.
14
+ */
15
+ export declare function buildExecReplayRunner(): ReplayRunner;
@@ -0,0 +1,47 @@
1
/**
 * Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
 * shell interpolation). Runs the claimed commands inside `dir`, the
 * verifier-box checkout.
 *
 * - `probe()` executes `process.execPath --version` and resolves
 *   `{ ok: true }`, or `{ ok: false, reason }` carrying the error message.
 * - `runInDir(dir, argv)` executes `argv[0]` with the remaining arguments in
 *   `dir` and always resolves `{ exitCode, stdout, stderr }`. An empty argv
 *   resolves exit code 1 with stderr "empty argv" without spawning anything.
 *   `exitCode` is the numeric `error.code` when present, `1` for any other
 *   error and `0` on success. `stderr` falls back to the error message when
 *   the process produced no stderr text, so a command that could not be
 *   spawned reports why.
 *
 * Trust-boundary note: this runs on the HOST in a given directory. Full
 * separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
 * production hardening — this is the host-dir baseline so the loop runs for real.
 */
export function buildExecReplayRunner() {
    return {
        probe: async () => {
            const { execFile } = await import("node:child_process");
            return new Promise((resolve) => {
                execFile(process.execPath, ["--version"], (error) => {
                    resolve(error
                        ? { ok: false, reason: error instanceof Error ? error.message : String(error) }
                        : { ok: true });
                });
            });
        },
        runInDir: async (dir, argv) => {
            const { execFile } = await import("node:child_process");
            const [file, ...rest] = argv;
            if (!file)
                return { exitCode: 1, stdout: "", stderr: "empty argv" };
            return new Promise((resolve) => {
                execFile(file, rest, { cwd: dir, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
                    const exitCode = error && typeof error.code === "number"
                        ? Number(error.code)
                        : error
                            ? 1
                            : 0;
                    // execFile always passes a string here (possibly empty), so
                    // the error message must be substituted on EMPTY stderr,
                    // not only on undefined.
                    const stderrText = stderr?.toString() ?? "";
                    const fallbackReason = error instanceof Error ? error.message : "";
                    resolve({
                        exitCode,
                        stdout: stdout?.toString() ?? "",
                        stderr: stderrText !== "" ? stderrText : fallbackReason,
                    });
                });
            });
        },
    };
}
@@ -0,0 +1,10 @@
1
+ export { decideReproductionVerdict, type ReproductionVerdict, type ClaimedChecks, type ObservedChecks, type ReproductionInput, type ReproductionResult, } from "./verdict.js";
2
+ export { checkDiff, DEFAULT_PROTECTED_PATTERNS, type DiffCheckResult, type TestAuthorship, } from "./diff-checker.js";
3
+ export { runReplay, type ReplayRunner, type ReplayCommands, type ReplayInput, } from "./replay-runner.js";
4
+ export { buildProofArtifact, type ProofArtifact } from "./proof-artifact.js";
5
+ export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, type MutationResult, type MutationGateResult, type MutationRunner, } from "./mutation-gate.js";
6
+ export { enforceReproductionVerdict, type EnforcementAction, type EnforcementDecision, } from "./enforcement.js";
7
+ export { reproduceRun, type ReproduceInput, type ReproduceRunners, type ReproduceOutput, } from "./reproduce.js";
8
+ export { buildExecReplayRunner } from "./exec-runner.js";
9
+ export { prepareVerifierCheckout, buildExecGitRunner, type GitRunner, type PrepareCheckoutInput, type VerifierCheckout, type CheckoutResult, } from "./checkout-prep.js";
10
+ export { reproduceAutonomousRun, runWorkspaceReproduction, type AutonomousReproduceInput, type AutonomousReproduceRunners, type AutonomousReproduceOutput, type WorkspaceReproduceInput, } from "./autonomous-gate.js";