wotann 0.5.95 → 0.5.97
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +68 -24
- package/dist/orchestration/proof-bundles.d.ts +8 -0
- package/dist/orchestration/proof-bundles.js +2 -0
- package/dist/security/approval-binding.d.ts +52 -0
- package/dist/security/approval-binding.js +57 -0
- package/dist/security/human-approval.d.ts +2 -0
- package/dist/security/human-approval.js +15 -24
- package/dist/ui/components/v3/AppV3.d.ts +10 -1
- package/dist/ui/components/v3/AppV3.js +34 -5
- package/dist/ui/components/v3/Transcript.d.ts +21 -1
- package/dist/ui/components/v3/Transcript.js +18 -58
- package/dist/ui/components/v3/TranscriptRow.d.ts +45 -0
- package/dist/ui/components/v3/TranscriptRow.js +102 -0
- package/dist/ui/inline-render.d.ts +28 -0
- package/dist/ui/inline-render.js +35 -0
- package/dist/verification/reproduction/autonomous-gate.d.ts +52 -0
- package/dist/verification/reproduction/autonomous-gate.js +71 -0
- package/dist/verification/reproduction/checkout-prep.d.ts +48 -0
- package/dist/verification/reproduction/checkout-prep.js +78 -0
- package/dist/verification/reproduction/diff-checker.d.ts +26 -0
- package/dist/verification/reproduction/diff-checker.js +33 -0
- package/dist/verification/reproduction/enforcement.d.ts +14 -0
- package/dist/verification/reproduction/enforcement.js +30 -0
- package/dist/verification/reproduction/exec-runner.d.ts +15 -0
- package/dist/verification/reproduction/exec-runner.js +47 -0
- package/dist/verification/reproduction/index.d.ts +10 -0
- package/dist/verification/reproduction/index.js +10 -0
- package/dist/verification/reproduction/mutation-gate.d.ts +42 -0
- package/dist/verification/reproduction/mutation-gate.js +43 -0
- package/dist/verification/reproduction/proof-artifact.d.ts +16 -0
- package/dist/verification/reproduction/proof-artifact.js +22 -0
- package/dist/verification/reproduction/replay-runner.d.ts +37 -0
- package/dist/verification/reproduction/replay-runner.js +28 -0
- package/dist/verification/reproduction/reproduce.d.ts +34 -0
- package/dist/verification/reproduction/reproduce.js +31 -0
- package/dist/verification/reproduction/verdict.d.ts +39 -0
- package/dist/verification/reproduction/verdict.js +40 -0
- package/package.json +1 -1
- package/dist/ui/opentui-chat.d.ts +0 -19
- package/dist/ui/opentui-chat.js +0 -285
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TranscriptRow — a single memoized conversation row.
|
|
3
|
+
*
|
|
4
|
+
* Extracted from `Transcript.tsx`'s inline row map so it can be rendered
|
|
5
|
+
* in BOTH places the AppV4 inline model needs it:
|
|
6
|
+
* 1. Committed history inside Ink `<Static>` (write-once → terminal
|
|
7
|
+
* scrollback). Static never re-renders an emitted row, so the memo
|
|
8
|
+
* is moot there — but identity-stable rows keep the contract clean.
|
|
9
|
+
* 2. The live in-flight turn (the streaming assistant row + its tool
|
|
10
|
+
* rows). Here the memo earns its keep: while the assistant row's
|
|
11
|
+
* content grows token-by-token, the sibling rows (the user prompt,
|
|
12
|
+
* finished tool rows) MUST NOT re-render.
|
|
13
|
+
*
|
|
14
|
+
* Why a CUSTOM comparator (not the default shallow `React.memo`):
|
|
15
|
+
* `toTranscriptMessages` rebuilds a fresh object for every message on
|
|
16
|
+
* every render (`messages.map((m) => ({...}))`), so a by-reference memo
|
|
17
|
+
* would re-render every row on every streamed token — defeating the
|
|
18
|
+
* point. We compare the fields that actually drive the render. The
|
|
19
|
+
* `attachments` array survives a reference check because
|
|
20
|
+
* `toTranscriptMessages` threads the SAME array through (it never copies
|
|
21
|
+
* it), so an unchanged message keeps an identity-stable attachments ref.
|
|
22
|
+
*
|
|
23
|
+
* Rendering is byte-identical to the prior inline map (Phase 1 is a pure
|
|
24
|
+
* no-op refactor): the horizontal padding stays a concern of the
|
|
25
|
+
* container/Static call-site, never the row, so this component composes
|
|
26
|
+
* the same in either mounting context.
|
|
27
|
+
*/
|
|
28
|
+
import React from "react";
|
|
29
|
+
import type { CapabilityProfile } from "../../capability-tier.js";
|
|
30
|
+
import type { TerminalCapabilities } from "../../terminal-capabilities.js";
|
|
31
|
+
import type { TranscriptMessageV3 } from "./Transcript.js";
|
|
32
|
+
export interface TranscriptRowProps {
|
|
33
|
+
readonly msg: TranscriptMessageV3;
|
|
34
|
+
readonly profile: CapabilityProfile;
|
|
35
|
+
readonly terminalCapabilities: TerminalCapabilities;
|
|
36
|
+
}
|
|
37
|
+
declare function TranscriptRowImpl({ msg, profile, terminalCapabilities, }: TranscriptRowProps): React.ReactElement;
|
|
38
|
+
/**
|
|
39
|
+
* Memo comparator — return TRUE to SKIP a re-render. Exported for direct
|
|
40
|
+
* unit testing (the streaming hot-path depends on this returning true for
|
|
41
|
+
* an unchanged sibling row while the assistant row streams).
|
|
42
|
+
*/
|
|
43
|
+
export declare function transcriptRowsEqual(prev: TranscriptRowProps, next: TranscriptRowProps): boolean;
|
|
44
|
+
export declare const TranscriptRow: React.MemoExoticComponent<typeof TranscriptRowImpl>;
|
|
45
|
+
export {};
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { jsx as _jsx, jsxs as _jsxs } from "#wotann-jsx/jsx-runtime";
|
|
2
|
+
/**
|
|
3
|
+
* TranscriptRow — a single memoized conversation row.
|
|
4
|
+
*
|
|
5
|
+
* Extracted from `Transcript.tsx`'s inline row map so it can be rendered
|
|
6
|
+
* in BOTH places the AppV4 inline model needs it:
|
|
7
|
+
* 1. Committed history inside Ink `<Static>` (write-once → terminal
|
|
8
|
+
* scrollback). Static never re-renders an emitted row, so the memo
|
|
9
|
+
* is moot there — but identity-stable rows keep the contract clean.
|
|
10
|
+
* 2. The live in-flight turn (the streaming assistant row + its tool
|
|
11
|
+
* rows). Here the memo earns its keep: while the assistant row's
|
|
12
|
+
* content grows token-by-token, the sibling rows (the user prompt,
|
|
13
|
+
* finished tool rows) MUST NOT re-render.
|
|
14
|
+
*
|
|
15
|
+
* Why a CUSTOM comparator (not the default shallow `React.memo`):
|
|
16
|
+
* `toTranscriptMessages` rebuilds a fresh object for every message on
|
|
17
|
+
* every render (`messages.map((m) => ({...}))`), so a by-reference memo
|
|
18
|
+
* would re-render every row on every streamed token — defeating the
|
|
19
|
+
* point. We compare the fields that actually drive the render. The
|
|
20
|
+
* `attachments` array survives a reference check because
|
|
21
|
+
* `toTranscriptMessages` threads the SAME array through (it never copies
|
|
22
|
+
* it), so an unchanged message keeps an identity-stable attachments ref.
|
|
23
|
+
*
|
|
24
|
+
* Rendering is byte-identical to the prior inline map (Phase 1 is a pure
|
|
25
|
+
* no-op refactor): the horizontal padding stays a concern of the
|
|
26
|
+
* container/Static call-site, never the row, so this component composes
|
|
27
|
+
* the same in either mounting context.
|
|
28
|
+
*/
|
|
29
|
+
import React from "react";
|
|
30
|
+
import { Box, Text } from "ink";
|
|
31
|
+
import { glyph } from "../../theme/tokens.js";
|
|
32
|
+
import { useThemeTone } from "../../theme/context.js";
|
|
33
|
+
import { parseSlashResultMessage, SystemMessageCard } from "./SystemMessageCard.js";
|
|
34
|
+
import { KittyGraphics } from "./KittyGraphics.js";
|
|
35
|
+
// Minimal role markers — Claude Code / Codex parity. User gets a
|
|
36
|
+
// single subtle ❯ (same glyph as the composer prompt); the assistant
|
|
37
|
+
// is PURE content (no badge, no label, no gutter) — the biggest
|
|
38
|
+
// declutter; system/tool get a dim · marker. No per-message gutter
|
|
39
|
+
// bar, no Norse runes, nothing bold.
|
|
40
|
+
// Rich-tier (A/B) markers: the themed prompt glyph for the user, nothing at
// all for the assistant, and a dim middle dot for system/tool rows.
const ROLE_STYLES_AB = {
    user: { gutterTone: "primary", badge: glyph.prompt, label: "" },
    assistant: { gutterTone: "muted", badge: "", label: "" },
    system: { gutterTone: "warning", badge: "·", label: "system" },
    tool: { gutterTone: "muted", badge: "·", label: "tool" },
};
// Tier C is the same table with ASCII-only badges. Each entry is a fresh
// object (spread copy) so the two tables never share references; overriding
// `badge` after the spread keeps the key order gutterTone → badge → label.
const ROLE_STYLES_C = {
    user: { ...ROLE_STYLES_AB.user, badge: ">" },
    assistant: { ...ROLE_STYLES_AB.assistant },
    system: { ...ROLE_STYLES_AB.system, badge: "*" },
    tool: { ...ROLE_STYLES_AB.tool, badge: "*" },
};
|
|
52
|
+
/**
 * Format an epoch-millisecond timestamp as a zero-padded local-time `HH:MM`.
 *
 * @param timestamp - Candidate timestamp; anything that is not a finite
 *   number is treated as "no timestamp".
 * @returns The `HH:MM` string, or `null` when there is no displayable time.
 */
function formatTime(timestamp) {
    if (typeof timestamp !== "number" || !Number.isFinite(timestamp))
        return null;
    const d = new Date(timestamp);
    // A finite number can still be outside the representable Date range
    // (beyond ±8.64e15 ms); that yields an Invalid Date whose getters return
    // NaN. Treat it as "no timestamp" rather than rendering "NaN:NaN".
    if (Number.isNaN(d.getTime()))
        return null;
    const hh = String(d.getHours()).padStart(2, "0");
    const mm = String(d.getMinutes()).padStart(2, "0");
    return `${hh}:${mm}`;
}
|
|
60
|
+
/**
 * Render one transcript message: an optional header (role badge/label and
 * timestamp), the content split into one `Text` per line, then attachments.
 *
 * System messages that carry a slash-command payload are rendered by
 * `SystemMessageCard` instead of the default layout.
 */
function TranscriptRowImpl({ msg, profile, terminalCapabilities, }) {
    const { tone } = useThemeTone();
    const isCompactTier = profile.tier === "C";
    // Slash-command results are detected via the marker embedded in the
    // message content; system messages without it use the default render.
    if (msg.role === "system") {
        const slashPayload = parseSlashResultMessage(msg.content);
        if (slashPayload !== null) {
            return _jsx(SystemMessageCard, { payload: slashPayload, profile: profile });
        }
    }
    const roleStyle = (isCompactTier ? ROLE_STYLES_C : ROLE_STYLES_AB)[msg.role];
    const markerColor = tone[roleStyle.gutterTone];
    const clock = formatTime(msg.timestamp);
    const showBadge = roleStyle.badge !== "";
    const showLabel = roleStyle.label !== "";
    const showClock = clock !== null;
    // Empty content still renders a single (blank) line.
    const bodyLines = msg.content.length === 0 ? [""] : msg.content.split("\n");
    const attachmentList = msg.attachments ?? [];
    // The header row is omitted entirely (evaluates to `false`) when there is
    // no badge, no label and no timestamp to show.
    const header = (showBadge || showLabel || showClock) &&
        _jsxs(Box, {
            justifyContent: "space-between",
            children: [
                _jsxs(Box, {
                    gap: 1,
                    children: [
                        showBadge && _jsx(Text, { color: markerColor, children: roleStyle.badge }),
                        showLabel && _jsx(Text, { color: markerColor, children: roleStyle.label }),
                    ],
                }),
                showClock && _jsx(Text, { color: tone.muted, children: clock }),
            ],
        });
    // A zero-length line is rendered as a single space.
    const renderLine = (line, index) => _jsx(Text, { color: tone.text, children: line.length === 0 ? " " : line }, `msg-${msg.id}-line-${index}`);
    const renderAttachment = (attachment, index) => {
        const key = `msg-${msg.id}-att-${index}`;
        // Images with inline data go through KittyGraphics, sized by tier.
        if (attachment.kind === "image" && attachment.dataUri !== undefined) {
            return _jsx(KittyGraphics, {
                source: attachment.dataUri,
                capabilities: terminalCapabilities,
                rows: isCompactTier ? 3 : 5,
                columns: isCompactTier ? 28 : 40,
                caption: attachment.path,
            }, key);
        }
        // Everything else is shown as a dim "[kind: path]" placeholder.
        const kindLabel = attachment.kind === "image" ? "image" : "file";
        return _jsxs(Text, { color: tone.muted, italic: true, children: ["[", kindLabel, ": ", attachment.path, "]"] }, key);
    };
    return _jsxs(Box, {
        flexDirection: "column",
        marginBottom: 1,
        children: [header, bodyLines.map(renderLine), attachmentList.map(renderAttachment)],
    });
}
|
|
88
|
+
/**
|
|
89
|
+
* Memo comparator — return TRUE to SKIP a re-render. Exported for direct
|
|
90
|
+
* unit testing (the streaming hot-path depends on this returning true for
|
|
91
|
+
* an unchanged sibling row while the assistant row streams).
|
|
92
|
+
*/
|
|
93
|
+
export function transcriptRowsEqual(prev, next) {
|
|
94
|
+
return (prev.msg.id === next.msg.id &&
|
|
95
|
+
prev.msg.content === next.msg.content &&
|
|
96
|
+
prev.msg.role === next.msg.role &&
|
|
97
|
+
prev.msg.timestamp === next.msg.timestamp &&
|
|
98
|
+
prev.msg.attachments === next.msg.attachments &&
|
|
99
|
+
prev.profile === next.profile &&
|
|
100
|
+
prev.terminalCapabilities === next.terminalCapabilities);
|
|
101
|
+
}
|
|
102
|
+
export const TranscriptRow = React.memo(TranscriptRowImpl, transcriptRowsEqual);
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inline-render mode gate (AppV4 flicker fix, phased rollout).
|
|
3
|
+
*
|
|
4
|
+
* The complete streaming-flicker fix renders the chat surface INLINE in
|
|
5
|
+
* the terminal's MAIN buffer (committed history written once into native
|
|
6
|
+
* scrollback via Ink `<Static>`) instead of the alternate screen buffer.
|
|
7
|
+
* The alt buffer has no scrollback, so `<Static>` is incompatible with it
|
|
8
|
+
* (committed rows would scroll off irrecoverably) — see
|
|
9
|
+
* docs/phase-0-redesign/phase-1-inline-architecture.md §0.
|
|
10
|
+
*
|
|
11
|
+
* Why a flag (not default-on yet): the mount-path change can only be
|
|
12
|
+
* validated for "feel" + cross-terminal correctness (iTerm2 / Terminal.app
|
|
13
|
+
* / tmux) on a PHYSICAL terminal, which CI/headless harnesses cannot
|
|
14
|
+
* reproduce (this is the same fragility that drove the PR #35-#38
|
|
15
|
+
* raw-mode/mount hardening). So inline mode ships behind
|
|
16
|
+
* `WOTANN_TUI_INLINE=1` for opt-in verification; once confirmed on a real
|
|
17
|
+
* terminal it becomes the default for the chat surface.
|
|
18
|
+
*
|
|
19
|
+
* The objective gate (eraseLines-sequence drop ≥90% vs the alt-buffer
|
|
20
|
+
* baseline) is exercised by the PTY byte-trace test, which DOES run here.
|
|
21
|
+
*/
|
|
22
|
+
/**
|
|
23
|
+
* Whether this session should render the chat surface inline in the main
|
|
24
|
+
* buffer (Ink `<Static>` committed history) rather than in the alt-screen
|
|
25
|
+
* buffer. Reads `WOTANN_TUI_INLINE`; defaults OFF during the phased
|
|
26
|
+
* rollout. Pass an explicit env for tests.
|
|
27
|
+
*/
|
|
28
|
+
export declare function isInlineRenderRequested(env?: NodeJS.ProcessEnv): boolean;
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inline-render mode gate (AppV4 flicker fix, phased rollout).
|
|
3
|
+
*
|
|
4
|
+
* The complete streaming-flicker fix renders the chat surface INLINE in
|
|
5
|
+
* the terminal's MAIN buffer (committed history written once into native
|
|
6
|
+
* scrollback via Ink `<Static>`) instead of the alternate screen buffer.
|
|
7
|
+
* The alt buffer has no scrollback, so `<Static>` is incompatible with it
|
|
8
|
+
* (committed rows would scroll off irrecoverably) — see
|
|
9
|
+
* docs/phase-0-redesign/phase-1-inline-architecture.md §0.
|
|
10
|
+
*
|
|
11
|
+
* Why a flag (not default-on yet): the mount-path change can only be
|
|
12
|
+
* validated for "feel" + cross-terminal correctness (iTerm2 / Terminal.app
|
|
13
|
+
* / tmux) on a PHYSICAL terminal, which CI/headless harnesses cannot
|
|
14
|
+
* reproduce (this is the same fragility that drove the PR #35-#38
|
|
15
|
+
* raw-mode/mount hardening). So inline mode ships behind
|
|
16
|
+
* `WOTANN_TUI_INLINE=1` for opt-in verification; once confirmed on a real
|
|
17
|
+
* terminal it becomes the default for the chat surface.
|
|
18
|
+
*
|
|
19
|
+
* The objective gate (eraseLines-sequence drop ≥90% vs the alt-buffer
|
|
20
|
+
* baseline) is exercised by the PTY byte-trace test, which DOES run here.
|
|
21
|
+
*/
|
|
22
|
+
/** Truthy values that opt a session into inline main-buffer rendering. */
|
|
23
|
+
const TRUTHY = new Set(["1", "true", "yes", "on"]);
/**
 * Decide whether the chat surface should be rendered inline in the main
 * terminal buffer instead of the alt-screen buffer.
 *
 * The decision comes from the `WOTANN_TUI_INLINE` variable of `env`: it is ON
 * only when the value, trimmed and lower-cased, is one of the TRUTHY tokens.
 * An unset variable means OFF. Pass an explicit `env` object in tests.
 */
export function isInlineRenderRequested(env = process.env) {
    const raw = env["WOTANN_TUI_INLINE"];
    return raw !== undefined && TRUTHY.has(raw.trim().toLowerCase());
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { type GitRunner } from "./checkout-prep.js";
|
|
2
|
+
import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
|
|
3
|
+
import { type ProofArtifact } from "./proof-artifact.js";
|
|
4
|
+
import { type EnforcementDecision } from "./enforcement.js";
|
|
5
|
+
import type { ReplayRunner, ReplayCommands } from "./replay-runner.js";
|
|
6
|
+
import type { MutationRunner } from "./mutation-gate.js";
|
|
7
|
+
export interface AutonomousReproduceInput {
|
|
8
|
+
readonly repoDir: string;
|
|
9
|
+
readonly baseRef: string;
|
|
10
|
+
readonly diffText: string;
|
|
11
|
+
readonly changedPaths: readonly string[];
|
|
12
|
+
readonly claimed: ClaimedChecks;
|
|
13
|
+
readonly commands: ReplayCommands;
|
|
14
|
+
/** A fresh, non-existent directory for the verifier-box worktree. */
|
|
15
|
+
readonly worktreeDir: string;
|
|
16
|
+
/** Paths to symlink from repoDir into the worktree (e.g. ["node_modules"]). */
|
|
17
|
+
readonly linkFromRepo?: readonly string[];
|
|
18
|
+
}
|
|
19
|
+
export interface AutonomousReproduceRunners {
|
|
20
|
+
readonly git: GitRunner;
|
|
21
|
+
readonly replay: ReplayRunner;
|
|
22
|
+
readonly mutation?: MutationRunner;
|
|
23
|
+
}
|
|
24
|
+
export interface AutonomousReproduceOutput {
|
|
25
|
+
readonly result: ReproductionResult;
|
|
26
|
+
readonly proof: ProofArtifact;
|
|
27
|
+
readonly enforcement: EnforcementDecision;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Full production composition: prepare an isolated checkout (the separate trust
|
|
31
|
+
* boundary), reproduce the claimed result inside it, decide enforcement, and
|
|
32
|
+
* ALWAYS clean up the worktree. A failed checkout => infra-error (never an
|
|
33
|
+
* auto-pass — we couldn't verify, so we don't silently allow).
|
|
34
|
+
*/
|
|
35
|
+
export declare function reproduceAutonomousRun(input: AutonomousReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
|
|
36
|
+
export interface WorkspaceReproduceInput {
|
|
37
|
+
/** The agent's workspace (its uncommitted changes are captured as the diff). */
|
|
38
|
+
readonly cwd: string;
|
|
39
|
+
readonly claimed: ClaimedChecks;
|
|
40
|
+
readonly commands: ReplayCommands;
|
|
41
|
+
/** A fresh, non-existent directory for the verifier-box worktree. */
|
|
42
|
+
readonly worktreeDir: string;
|
|
43
|
+
/** Paths to symlink from the workspace into the worktree (e.g. ["node_modules"]). */
|
|
44
|
+
readonly linkFromRepo?: readonly string[];
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Capture the workspace's git state (HEAD as base + the uncommitted diff) and
|
|
48
|
+
* reproduce. Assumes the agent's changes are UNCOMMITTED (the common autonomous
|
|
49
|
+
* case): HEAD is the base and `git diff` is the agent's work. If the agent
|
|
50
|
+
* committed mid-run, the diff is empty and the clean base is reproduced.
|
|
51
|
+
*/
|
|
52
|
+
export declare function runWorkspaceReproduction(input: WorkspaceReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { prepareVerifierCheckout } from "./checkout-prep.js";
|
|
2
|
+
import { reproduceRun } from "./reproduce.js";
|
|
3
|
+
import { decideReproductionVerdict, } from "./verdict.js";
|
|
4
|
+
import { buildProofArtifact } from "./proof-artifact.js";
|
|
5
|
+
import { enforceReproductionVerdict } from "./enforcement.js";
|
|
6
|
+
/**
|
|
7
|
+
* Full production composition: prepare an isolated checkout (the separate trust
|
|
8
|
+
* boundary), reproduce the claimed result inside it, decide enforcement, and
|
|
9
|
+
* ALWAYS clean up the worktree. A failed checkout => infra-error (never an
|
|
10
|
+
* auto-pass — we couldn't verify, so we don't silently allow).
|
|
11
|
+
*/
|
|
12
|
+
export async function reproduceAutonomousRun(input, runners) {
    // Step 1: build the isolated verifier checkout. `linkFromRepo` is only
    // forwarded when the caller supplied it.
    const checkoutRequest = {
        repoDir: input.repoDir,
        baseRef: input.baseRef,
        diffText: input.diffText,
        worktreeDir: input.worktreeDir,
    };
    if (input.linkFromRepo) {
        checkoutRequest.linkFromRepo = input.linkFromRepo;
    }
    const prepared = await prepareVerifierCheckout(checkoutRequest, runners.git);
    // Step 2: no checkout means nothing could be observed. Report that as an
    // infra error so the verdict logic never treats it as a pass.
    if (!prepared.ok) {
        const result = decideReproductionVerdict({
            claimed: input.claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `checkout:${prepared.error}`,
        });
        return {
            result,
            proof: buildProofArtifact(result, input.diffText),
            enforcement: enforceReproductionVerdict(result.verdict),
        };
    }
    // Step 3: replay inside the checkout. The worktree is removed in `finally`
    // whether the replay returns or throws.
    try {
        const reproRunners = { replay: runners.replay };
        if (runners.mutation) {
            // The mutation runner is optional; include the key only when set.
            reproRunners.mutation = runners.mutation;
        }
        const outcome = await reproduceRun({
            claimed: input.claimed,
            changedPaths: input.changedPaths,
            checkoutDir: prepared.checkout.checkoutDir,
            commands: input.commands,
            diffText: input.diffText,
            mutationFiles: input.changedPaths,
        }, reproRunners);
        return {
            result: outcome.result,
            proof: outcome.proof,
            enforcement: enforceReproductionVerdict(outcome.result.verdict),
        };
    }
    finally {
        await prepared.checkout.cleanup();
    }
}
|
|
48
|
+
/**
|
|
49
|
+
* Capture the workspace's git state (HEAD as base + the uncommitted diff) and
|
|
50
|
+
* reproduce. Assumes the agent's changes are UNCOMMITTED (the common autonomous
|
|
51
|
+
* case): HEAD is the base and `git diff` is the agent's work. If the agent
|
|
52
|
+
* committed mid-run, the diff is empty and the clean base is reproduced.
|
|
53
|
+
*/
|
|
54
|
+
export async function runWorkspaceReproduction(input, runners) {
    // Any failure while CAPTURING the workspace state is reported as an
    // infra-error. Previously the exit codes were ignored: a failed `git diff`
    // produced an empty diff (so the clean base was reproduced in place of the
    // agent's work) and a failed `--name-only` produced an empty path list
    // (so the changed-path checks had nothing to inspect).
    const captureFailure = (step, res, diffText) => {
        const detail = String(res.stderr ?? "").trim().slice(0, 240);
        const result = decideReproductionVerdict({
            claimed: input.claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `capture:${step}-failed:exit=${res.exitCode}:${detail}`,
        });
        const proof = buildProofArtifact(result, diffText);
        return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
    };
    // HEAD is the base the agent started from.
    const head = await runners.git.run(input.cwd, ["rev-parse", "HEAD"]);
    if (head.exitCode !== 0)
        return captureFailure("rev-parse", head, "");
    // The uncommitted diff is the agent's work.
    // NOTE(review): plain `git diff` compares the working tree to the index,
    // so staged changes and untracked (new) files are presumably not captured
    // here — confirm whether callers guarantee an unstaged-only workspace.
    const diff = await runners.git.run(input.cwd, ["diff"]);
    if (diff.exitCode !== 0)
        return captureFailure("diff", diff, "");
    const names = await runners.git.run(input.cwd, ["diff", "--name-only"]);
    if (names.exitCode !== 0)
        return captureFailure("diff-name-only", names, diff.stdout);
    const changedPaths = names.stdout
        .split("\n")
        .map((s) => s.trim())
        .filter(Boolean);
    return reproduceAutonomousRun({
        repoDir: input.cwd,
        baseRef: head.stdout.trim(),
        diffText: diff.stdout,
        changedPaths,
        claimed: input.claimed,
        commands: input.commands,
        worktreeDir: input.worktreeDir,
        // Forward `linkFromRepo` only when the caller supplied it.
        ...(input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {}),
    }, runners);
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prepare the separate-trust-boundary checkout for verify-by-reproduction
|
|
3
|
+
* (spec §3.1, the missing prerequisite for a correct production trigger).
|
|
4
|
+
*
|
|
5
|
+
* The verifier box MUST run a CLEAN base checkout + ONLY the agent's diff — in
|
|
6
|
+
* a separate directory — so the agent's dirty workspace (runtime tampering,
|
|
7
|
+
* stray files, monkeypatched runners) can never influence the result. Running
|
|
8
|
+
* the replay in the agent's own `process.cwd()` would be the BenchJack V1
|
|
9
|
+
* violation this defeats.
|
|
10
|
+
*/
|
|
11
|
+
export interface GitRunner {
|
|
12
|
+
readonly run: (cwd: string, argv: readonly string[]) => Promise<{
|
|
13
|
+
exitCode: number;
|
|
14
|
+
stdout: string;
|
|
15
|
+
stderr: string;
|
|
16
|
+
}>;
|
|
17
|
+
}
|
|
18
|
+
/** Production GitRunner via execFile (argv-only, injection-safe). */
|
|
19
|
+
export declare function buildExecGitRunner(): GitRunner;
|
|
20
|
+
export interface PrepareCheckoutInput {
|
|
21
|
+
/** The agent's repository (the worktree is registered here). */
|
|
22
|
+
readonly repoDir: string;
|
|
23
|
+
/** The commit the agent started from (before its changes). */
|
|
24
|
+
readonly baseRef: string;
|
|
25
|
+
/** The agent's diff (unified, as from `git diff`). Empty = clean base. */
|
|
26
|
+
readonly diffText: string;
|
|
27
|
+
/** A fresh, non-existent directory for the detached worktree. */
|
|
28
|
+
readonly worktreeDir: string;
|
|
29
|
+
/**
|
|
30
|
+
* Paths (relative to repoDir) to symlink into the worktree after checkout —
|
|
31
|
+
* e.g. ["node_modules"]. A fresh worktree lacks gitignored deps, so without
|
|
32
|
+
* this the replay's `npm test` would fail on missing modules (a FALSE
|
|
33
|
+
* 'contradicted'). Deps are not the grading surface, so sharing them is safe.
|
|
34
|
+
*/
|
|
35
|
+
readonly linkFromRepo?: readonly string[];
|
|
36
|
+
}
|
|
37
|
+
export interface VerifierCheckout {
|
|
38
|
+
readonly checkoutDir: string;
|
|
39
|
+
readonly cleanup: () => Promise<void>;
|
|
40
|
+
}
|
|
41
|
+
export type CheckoutResult = {
|
|
42
|
+
readonly ok: true;
|
|
43
|
+
readonly checkout: VerifierCheckout;
|
|
44
|
+
} | {
|
|
45
|
+
readonly ok: false;
|
|
46
|
+
readonly error: string;
|
|
47
|
+
};
|
|
48
|
+
export declare function prepareVerifierCheckout(input: PrepareCheckoutInput, git: GitRunner): Promise<CheckoutResult>;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { existsSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
/** Production GitRunner via execFile (argv-only, injection-safe). */
|
|
4
|
+
export function buildExecGitRunner() {
|
|
5
|
+
return {
|
|
6
|
+
run: async (cwd, argv) => {
|
|
7
|
+
const { execFile } = await import("node:child_process");
|
|
8
|
+
return new Promise((resolve) => {
|
|
9
|
+
execFile("git", [...argv], { cwd, maxBuffer: 64 * 1024 * 1024 }, (error, stdout, stderr) => {
|
|
10
|
+
const exitCode = error && typeof error.code === "number"
|
|
11
|
+
? Number(error.code)
|
|
12
|
+
: error
|
|
13
|
+
? 1
|
|
14
|
+
: 0;
|
|
15
|
+
resolve({
|
|
16
|
+
exitCode,
|
|
17
|
+
stdout: stdout?.toString() ?? "",
|
|
18
|
+
stderr: stderr?.toString() ?? (error instanceof Error ? error.message : ""),
|
|
19
|
+
});
|
|
20
|
+
});
|
|
21
|
+
});
|
|
22
|
+
},
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
export async function prepareVerifierCheckout(input, git) {
|
|
26
|
+
const add = await git.run(input.repoDir, [
|
|
27
|
+
"worktree",
|
|
28
|
+
"add",
|
|
29
|
+
"--detach",
|
|
30
|
+
input.worktreeDir,
|
|
31
|
+
input.baseRef,
|
|
32
|
+
]);
|
|
33
|
+
if (add.exitCode !== 0) {
|
|
34
|
+
return {
|
|
35
|
+
ok: false,
|
|
36
|
+
error: `worktree-add-failed:exit=${add.exitCode}:${add.stderr.trim().slice(0, 240)}`,
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
const cleanup = async () => {
|
|
40
|
+
await git.run(input.repoDir, ["worktree", "remove", "--force", input.worktreeDir]);
|
|
41
|
+
};
|
|
42
|
+
if (input.diffText.trim() !== "") {
|
|
43
|
+
const patchPath = join(input.worktreeDir, ".wotann-verify.patch");
|
|
44
|
+
try {
|
|
45
|
+
writeFileSync(patchPath, input.diffText, "utf-8");
|
|
46
|
+
}
|
|
47
|
+
catch (e) {
|
|
48
|
+
await cleanup();
|
|
49
|
+
return {
|
|
50
|
+
ok: false,
|
|
51
|
+
error: `patch-write-failed:${e instanceof Error ? e.message : String(e)}`,
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
const apply = await git.run(input.worktreeDir, ["apply", "--whitespace=nowarn", patchPath]);
|
|
55
|
+
if (apply.exitCode !== 0) {
|
|
56
|
+
await cleanup();
|
|
57
|
+
return {
|
|
58
|
+
ok: false,
|
|
59
|
+
error: `apply-failed:exit=${apply.exitCode}:${apply.stderr.trim().slice(0, 240)}`,
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
for (const rel of input.linkFromRepo ?? []) {
|
|
64
|
+
const src = join(input.repoDir, rel);
|
|
65
|
+
const dst = join(input.worktreeDir, rel);
|
|
66
|
+
// Best-effort: skip when the source is absent or the target already exists.
|
|
67
|
+
if (existsSync(src) && !existsSync(dst)) {
|
|
68
|
+
try {
|
|
69
|
+
symlinkSync(src, dst, "dir");
|
|
70
|
+
}
|
|
71
|
+
catch {
|
|
72
|
+
// A failed symlink just means the replay may hit missing deps and
|
|
73
|
+
// report a result the verdict logic handles — never a silent pass.
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
return { ok: true, checkout: { checkoutDir: input.worktreeDir, cleanup } };
|
|
78
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
|
|
3
|
+
* must touch ONLY source — never the test harness / its own grader. A diff
|
|
4
|
+
* that edits the grading surface is TAMPER, not a normal failure.
|
|
5
|
+
*
|
|
6
|
+
* NOTE: package.json is deliberately NOT protected here (legit dep edits are
|
|
7
|
+
* common); detecting edits to its `scripts.test` field specifically is a
|
|
8
|
+
* Phase-B field-level refinement.
|
|
9
|
+
*/
|
|
10
|
+
export declare const DEFAULT_PROTECTED_PATTERNS: readonly RegExp[];
|
|
11
|
+
export interface TestAuthorship {
|
|
12
|
+
/** Matches files considered "tests" (whose edits are gated). */
|
|
13
|
+
readonly testFilePattern: RegExp;
|
|
14
|
+
/** Test files the agent legitimately created/edited THIS task (allowlisted). */
|
|
15
|
+
readonly authoredTestFiles: readonly string[];
|
|
16
|
+
}
|
|
17
|
+
export interface DiffCheckResult {
|
|
18
|
+
readonly tampered: boolean;
|
|
19
|
+
readonly offendingPaths: readonly string[];
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Pure. `changedPaths` are repo-relative POSIX paths the agent's diff touched.
|
|
23
|
+
* A path is offending if it matches a protected pattern, OR it is a test file
|
|
24
|
+
* (per `authorship.testFilePattern`) that is not in `authoredTestFiles`.
|
|
25
|
+
*/
|
|
26
|
+
export declare function checkDiff(changedPaths: readonly string[], patterns?: readonly RegExp[], authorship?: TestAuthorship): DiffCheckResult;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
|
|
3
|
+
* must touch ONLY source — never the test harness / its own grader. A diff
|
|
4
|
+
* that edits the grading surface is TAMPER, not a normal failure.
|
|
5
|
+
*
|
|
6
|
+
* NOTE: package.json is deliberately NOT protected here (legit dep edits are
|
|
7
|
+
* common); detecting edits to its `scripts.test` field specifically is a
|
|
8
|
+
* Phase-B field-level refinement.
|
|
9
|
+
*/
|
|
10
|
+
export const DEFAULT_PROTECTED_PATTERNS = Object.freeze([
    /(^|\/)conftest\.py$/,
    /(^|\/)pytest\.ini$/,
    /(^|\/)tox\.ini$/,
    /(^|\/)(jest|vitest|playwright)\.config\.[cm]?[jt]s$/,
    /(^|\/)\.mocharc\.[a-z]+$/,
    /(^|\/)\.git(\/|$)/,
]);
/**
 * Flag changed paths that touch the grading surface.
 *
 * A path is offending when it matches any of `patterns`, or — when
 * `authorship` is given — when it matches `authorship.testFilePattern` and is
 * not listed in `authorship.authoredTestFiles`.
 *
 * @param changedPaths - Repo-relative POSIX paths touched by the diff.
 * @param patterns - Protected-path patterns (defaults to
 *   DEFAULT_PROTECTED_PATTERNS).
 * @param authorship - Optional test-file gate with an allowlist.
 * @returns `{ tampered, offendingPaths }`; `offendingPaths` keeps the input
 *   order and `tampered` is true when it is non-empty.
 */
export function checkDiff(changedPaths, patterns = DEFAULT_PROTECTED_PATTERNS, authorship) {
    const authored = new Set(authorship?.authoredTestFiles ?? []);
    // A global (`g`) or sticky (`y`) regex keeps `lastIndex` between `.test()`
    // calls, so reusing it across paths makes matches depend on the PREVIOUS
    // path and can let a tampering path slip through. Reset the cursor first
    // for stateful regexes only; stateless ones are never written to (so
    // frozen RegExp objects keep working).
    const matches = (re, path) => {
        if (re.global || re.sticky)
            re.lastIndex = 0;
        return re.test(path);
    };
    const offendingPaths = changedPaths.filter((p) => {
        if (patterns.some((re) => matches(re, p)))
            return true;
        if (authorship && matches(authorship.testFilePattern, p) && !authored.has(p))
            return true;
        return false;
    });
    return { tampered: offendingPaths.length > 0, offendingPaths };
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { ReproductionVerdict } from "./verdict.js";
|
|
2
|
+
export type EnforcementAction = "allow" | "block" | "surface" | "escalate";
|
|
3
|
+
export interface EnforcementDecision {
|
|
4
|
+
readonly action: EnforcementAction;
|
|
5
|
+
readonly reason: string;
|
|
6
|
+
}
|
|
7
|
+
/**
 * Maps a reproduction verdict to an enforcement decision, asymmetrically
 * (spec §3.2): a false PASS is treated as the worst outcome and a false BLOCK
 * as recoverable. Reproduction-sourced `tamper` and `contradicted` verdicts
 * therefore hard-block, while weaker signals are surfaced or escalated.
 *
 * The decision is keyed on the REPRODUCTION verdict (executable evidence),
 * never on bare LLM-judge text; per the original design note, the judge's
 * TNR<25% "yes-man" problem is why enforcement waited for this channel.
 *
 * @param verdict Verdict produced by the reproduction pipeline.
 * @returns The action to take together with a human-readable reason.
 */
export declare function enforceReproductionVerdict(verdict: ReproductionVerdict): EnforcementDecision;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Asymmetric enforcement (spec §3.2). A false PASS is the moat-killer; a false
|
|
3
|
+
* BLOCK is recoverable — so reproduction-sourced tamper/contradicted HARD-BLOCK,
|
|
4
|
+
* while weaker signals surface or escalate. Keyed on the REPRODUCTION verdict
|
|
5
|
+
* (trustworthy, executable) — never on bare LLM-judge text (the TNR<25% yes-man
|
|
6
|
+
* problem is exactly why the enforce-flip waited for the reproduction channel).
|
|
7
|
+
*/
|
|
8
|
+
export function enforceReproductionVerdict(verdict) {
|
|
9
|
+
switch (verdict) {
|
|
10
|
+
case "tamper":
|
|
11
|
+
return { action: "block", reason: "diff tampered with the grading surface" };
|
|
12
|
+
case "contradicted":
|
|
13
|
+
return {
|
|
14
|
+
action: "block",
|
|
15
|
+
reason: "claimed success contradicted by independent reproduction",
|
|
16
|
+
};
|
|
17
|
+
case "weak-tests":
|
|
18
|
+
return {
|
|
19
|
+
action: "surface",
|
|
20
|
+
reason: "reproduction passed but tests are too weak to trust (low mutation score)",
|
|
21
|
+
};
|
|
22
|
+
case "infra-error":
|
|
23
|
+
return {
|
|
24
|
+
action: "escalate",
|
|
25
|
+
reason: "could not reproduce — verify manually before trusting",
|
|
26
|
+
};
|
|
27
|
+
case "reproduced":
|
|
28
|
+
return { action: "allow", reason: "independently reproduced in a separate trust boundary" };
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { ReplayRunner } from "./replay-runner.js";
|
|
2
|
+
/**
 * Builds the production `ReplayRunner`, implemented on top of
 * `child_process.execFile`. Commands are passed as an argv array, so no shell
 * is involved and command content cannot be interpolated into a shell line
 * (matching the codebase's execFileNoThrow contract). Claimed commands are
 * executed with `dir`, the verifier-box checkout, as the working directory.
 *
 * Probe behaviour: checks that the Node interpreter itself can be launched and
 * reports the failure reason explicitly if not, so that runReplay can emit
 * `infra-error` rather than a silent pass.
 *
 * Trust-boundary note: execution happens on the HOST in the given directory.
 * Full separate-trust-boundary isolation (a container, per spec §3.1) is
 * deferred production hardening; this is the host-directory baseline that
 * lets the loop run for real.
 */
export declare function buildExecReplayRunner(): ReplayRunner;
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
|
|
3
|
+
* shell interpolation, so injection-safe regardless of the command content,
|
|
4
|
+
* matching the codebase's execFileNoThrow contract). Runs the claimed commands
|
|
5
|
+
* inside `dir`, the verifier-box checkout.
|
|
6
|
+
*
|
|
7
|
+
* Honest probe: confirms the Node interpreter is runnable; reports the failure
|
|
8
|
+
* explicitly otherwise (so runReplay emits `infra-error`, never a silent pass).
|
|
9
|
+
*
|
|
10
|
+
* Trust-boundary note: this runs on the HOST in a given directory. Full
|
|
11
|
+
* separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
|
|
12
|
+
* production hardening — this is the host-dir baseline so the loop runs for real.
|
|
13
|
+
*/
|
|
14
|
+
export function buildExecReplayRunner() {
|
|
15
|
+
return {
|
|
16
|
+
probe: async () => {
|
|
17
|
+
const { execFile } = await import("node:child_process");
|
|
18
|
+
return new Promise((resolve) => {
|
|
19
|
+
execFile(process.execPath, ["--version"], (error) => {
|
|
20
|
+
resolve(error
|
|
21
|
+
? { ok: false, reason: error instanceof Error ? error.message : String(error) }
|
|
22
|
+
: { ok: true });
|
|
23
|
+
});
|
|
24
|
+
});
|
|
25
|
+
},
|
|
26
|
+
runInDir: async (dir, argv) => {
|
|
27
|
+
const { execFile } = await import("node:child_process");
|
|
28
|
+
const [file, ...rest] = argv;
|
|
29
|
+
if (!file)
|
|
30
|
+
return { exitCode: 1, stdout: "", stderr: "empty argv" };
|
|
31
|
+
return new Promise((resolve) => {
|
|
32
|
+
execFile(file, rest, { cwd: dir, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
|
|
33
|
+
const exitCode = error && typeof error.code === "number"
|
|
34
|
+
? Number(error.code)
|
|
35
|
+
: error
|
|
36
|
+
? 1
|
|
37
|
+
: 0;
|
|
38
|
+
resolve({
|
|
39
|
+
exitCode,
|
|
40
|
+
stdout: stdout?.toString() ?? "",
|
|
41
|
+
stderr: stderr?.toString() ?? (error instanceof Error ? error.message : ""),
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
},
|
|
46
|
+
};
|
|
47
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export { decideReproductionVerdict, type ReproductionVerdict, type ClaimedChecks, type ObservedChecks, type ReproductionInput, type ReproductionResult, } from "./verdict.js";
|
|
2
|
+
export { checkDiff, DEFAULT_PROTECTED_PATTERNS, type DiffCheckResult, type TestAuthorship, } from "./diff-checker.js";
|
|
3
|
+
export { runReplay, type ReplayRunner, type ReplayCommands, type ReplayInput, } from "./replay-runner.js";
|
|
4
|
+
export { buildProofArtifact, type ProofArtifact } from "./proof-artifact.js";
|
|
5
|
+
export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, type MutationResult, type MutationGateResult, type MutationRunner, } from "./mutation-gate.js";
|
|
6
|
+
export { enforceReproductionVerdict, type EnforcementAction, type EnforcementDecision, } from "./enforcement.js";
|
|
7
|
+
export { reproduceRun, type ReproduceInput, type ReproduceRunners, type ReproduceOutput, } from "./reproduce.js";
|
|
8
|
+
export { buildExecReplayRunner } from "./exec-runner.js";
|
|
9
|
+
export { prepareVerifierCheckout, buildExecGitRunner, type GitRunner, type PrepareCheckoutInput, type VerifierCheckout, type CheckoutResult, } from "./checkout-prep.js";
|
|
10
|
+
export { reproduceAutonomousRun, runWorkspaceReproduction, type AutonomousReproduceInput, type AutonomousReproduceRunners, type AutonomousReproduceOutput, type WorkspaceReproduceInput, } from "./autonomous-gate.js";
|