wotann 0.5.96 → 0.5.97
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +42 -0
- package/dist/orchestration/proof-bundles.d.ts +8 -0
- package/dist/orchestration/proof-bundles.js +2 -0
- package/dist/security/approval-binding.d.ts +52 -0
- package/dist/security/approval-binding.js +57 -0
- package/dist/security/human-approval.d.ts +2 -0
- package/dist/security/human-approval.js +15 -24
- package/dist/verification/reproduction/autonomous-gate.d.ts +52 -0
- package/dist/verification/reproduction/autonomous-gate.js +71 -0
- package/dist/verification/reproduction/checkout-prep.d.ts +48 -0
- package/dist/verification/reproduction/checkout-prep.js +78 -0
- package/dist/verification/reproduction/diff-checker.d.ts +26 -0
- package/dist/verification/reproduction/diff-checker.js +33 -0
- package/dist/verification/reproduction/enforcement.d.ts +14 -0
- package/dist/verification/reproduction/enforcement.js +30 -0
- package/dist/verification/reproduction/exec-runner.d.ts +15 -0
- package/dist/verification/reproduction/exec-runner.js +47 -0
- package/dist/verification/reproduction/index.d.ts +10 -0
- package/dist/verification/reproduction/index.js +10 -0
- package/dist/verification/reproduction/mutation-gate.d.ts +42 -0
- package/dist/verification/reproduction/mutation-gate.js +43 -0
- package/dist/verification/reproduction/proof-artifact.d.ts +16 -0
- package/dist/verification/reproduction/proof-artifact.js +22 -0
- package/dist/verification/reproduction/replay-runner.d.ts +37 -0
- package/dist/verification/reproduction/replay-runner.js +28 -0
- package/dist/verification/reproduction/reproduce.d.ts +34 -0
- package/dist/verification/reproduction/reproduce.js +31 -0
- package/dist/verification/reproduction/verdict.d.ts +39 -0
- package/dist/verification/reproduction/verdict.js +40 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -5379,6 +5379,48 @@ program
|
|
|
5379
5379
|
? chalk.green(" Done — Task completed successfully")
|
|
5380
5380
|
: chalk.red(" Failed — Task did not complete"));
|
|
5381
5381
|
console.log();
|
|
5382
|
+
// Independent verify-by-reproduction (opt-in: WOTANN_REPRODUCE=1).
|
|
5383
|
+
// Default-off because re-running the suite in an isolated checkout
|
|
5384
|
+
// roughly doubles run time. Replays the agent's claimed checks in a
|
|
5385
|
+
// clean base+diff worktree (a separate trust boundary, deps symlinked)
|
|
5386
|
+
// and reports a verdict the agent cannot fake. Best-effort — never
|
|
5387
|
+
// breaks the run.
|
|
5388
|
+
if (process.env.WOTANN_REPRODUCE === "1") {
|
|
5389
|
+
try {
|
|
5390
|
+
const [repro, os, path] = await Promise.all([
|
|
5391
|
+
import("./verification/reproduction/index.js"),
|
|
5392
|
+
import("node:os"),
|
|
5393
|
+
import("node:path"),
|
|
5394
|
+
]);
|
|
5395
|
+
const git = repro.buildExecGitRunner();
|
|
5396
|
+
const lastCycle = result.cycles[result.cycles.length - 1];
|
|
5397
|
+
const verdict = await repro.runWorkspaceReproduction({
|
|
5398
|
+
cwd: process.cwd(),
|
|
5399
|
+
claimed: {
|
|
5400
|
+
testsPass: lastCycle?.testsPass ?? false,
|
|
5401
|
+
typecheckPass: lastCycle?.typecheckPass ?? false,
|
|
5402
|
+
lintPass: lastCycle?.lintPass ?? false,
|
|
5403
|
+
},
|
|
5404
|
+
commands: { test: ["npm", "test"], typecheck: ["npm", "run", "typecheck"] },
|
|
5405
|
+
worktreeDir: path.join(os.tmpdir(), `wotann-verify-${Date.now()}`),
|
|
5406
|
+
linkFromRepo: ["node_modules"],
|
|
5407
|
+
}, { git, replay: repro.buildExecReplayRunner() });
|
|
5408
|
+
const tag = verdict.enforcement.action === "block"
|
|
5409
|
+
? chalk.red(`⛔ ${verdict.result.verdict}`)
|
|
5410
|
+
: verdict.enforcement.action === "allow"
|
|
5411
|
+
? chalk.green(`✓ ${verdict.result.verdict}`)
|
|
5412
|
+
: chalk.yellow(`⚠ ${verdict.result.verdict}`);
|
|
5413
|
+
console.log(chalk.bold("Independent reproduction (separate trust boundary):"));
|
|
5414
|
+
console.log(` ${tag} — ${verdict.enforcement.reason}`);
|
|
5415
|
+
for (const c of verdict.result.contradictions) {
|
|
5416
|
+
console.log(chalk.dim(` • ${c}`));
|
|
5417
|
+
}
|
|
5418
|
+
console.log();
|
|
5419
|
+
}
|
|
5420
|
+
catch (e) {
|
|
5421
|
+
console.log(chalk.dim(` Reproduction skipped: ${e instanceof Error ? e.message : String(e)}`));
|
|
5422
|
+
}
|
|
5423
|
+
}
|
|
5382
5424
|
process.exit(result.success ? 0 : 1);
|
|
5383
5425
|
}
|
|
5384
5426
|
finally {
|
|
@@ -47,6 +47,14 @@ export interface AutonomousProofBundle {
|
|
|
47
47
|
readonly visualVerificationEnabled: boolean;
|
|
48
48
|
readonly visualExpectation?: string;
|
|
49
49
|
readonly finalChecks: {
|
|
50
|
+
/**
|
|
51
|
+
* Provenance of these checks. "self-reported" = copied from the agent's
|
|
52
|
+
* own cycle result — a CLAIM, not an independently reproduced result. A
|
|
53
|
+
* green check is a claim, not proof (verify-by-reproduction, V7); a future
|
|
54
|
+
* verdict re-runs these in a separate trust boundary and reports a
|
|
55
|
+
* "reproduced" source instead.
|
|
56
|
+
*/
|
|
57
|
+
readonly source: "self-reported";
|
|
50
58
|
readonly testsPass: boolean;
|
|
51
59
|
readonly typecheckPass: boolean;
|
|
52
60
|
readonly lintPass: boolean;
|
|
@@ -44,6 +44,8 @@ export function writeAutonomousProofBundle(input) {
|
|
|
44
44
|
visualExpectation: input.visualExpectation,
|
|
45
45
|
finalChecks: lastCycle
|
|
46
46
|
? {
|
|
47
|
+
// Self-reported by the agent's own cycle — a claim, not proof (V7).
|
|
48
|
+
source: "self-reported",
|
|
47
49
|
testsPass: lastCycle.testsPass,
|
|
48
50
|
typecheckPass: lastCycle.typecheckPass,
|
|
49
51
|
lintPass: lastCycle.lintPass,
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Exact-bytes approval binding (corpus F4; defeats the OpenClaw CVE-2026-29607
|
|
3
|
+
* "approval persisted at the wrapper level, not the inner command" class and
|
|
4
|
+
* the TOCTOU race). Bind the canonicalized action's HMAC at approval time;
|
|
5
|
+
* re-pin at execution and abort on drift, replay (single-use nonce), or expiry.
|
|
6
|
+
*/
|
|
7
|
+
/** A fully resolved tool invocation: the exact input that is hashed for an approval. */
export interface CanonicalAction {
    /** Tool name; `canonicalizeAction` trims surrounding whitespace before serializing. */
    readonly tool: string;
    /** Argument vector, serialized in the given order. */
    readonly args: readonly string[];
    /** Working directory of the action. */
    readonly cwd: string;
}
/** Approval token produced by `ApprovalBinder.bind` and checked by `ApprovalBinder.verify`. */
export interface ApprovalBinding {
    /** Per-binder sequential identifier of the form `bind-<n>`. */
    readonly bindingId: string;
    /** Keyed hash of the canonicalized action, computed at approval time. */
    readonly actionHash: string;
    /** Single-use value; a successful `verify` marks it as consumed. */
    readonly nonce: string;
    /** Clock value (per the binder's `now`) after which the binding is rejected. */
    readonly expiresAt: number;
}
/** Outcome of `ApprovalBinder.verify`; a failure carries a human-readable `reason`. */
export type VerifyResult =
    | { readonly ok: true }
    | { readonly ok: false; readonly reason: string };
/**
 * Produce the deterministic string form of an action (a JSON object with the
 * keys `tool`, `args`, `cwd`, in that order). The CALLER must resolve shell
 * expansion, environment substitution and paths BEFORE binding, so that the
 * bound bytes are exactly what will execute rather than a wrapper around it.
 */
export declare function canonicalizeAction(action: CanonicalAction): string;
/** Optional tuning knobs for `ApprovalBinder`. */
export interface ApprovalBinderOptions {
    /** Lifetime of a binding in milliseconds; defaults to five minutes. */
    readonly ttlMs?: number;
    /** Clock source; defaults to `Date.now`. Inject for deterministic tests. */
    readonly now?: () => number;
}
/**
 * Stateful approval binder. The set of consumed nonces is kept per instance,
 * not in module-global state.
 */
export declare class ApprovalBinder {
    private readonly secret;
    private readonly ttlMs;
    private readonly now;
    private readonly consumed;
    private counter;
    constructor(secret: Buffer | string, opts?: ApprovalBinderOptions);
    /** Hash the canonicalized action and issue a binding; `nonce` is generated when omitted. */
    bind(action: CanonicalAction, nonce?: string): ApprovalBinding;
    /**
     * Re-check a binding against the action that is ABOUT to run. Rejects on
     * replay, expiry, or a hash mismatch, and consumes the nonce on success.
     */
    verify(binding: ApprovalBinding, action: CanonicalAction): VerifyResult;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { timingSafeEqual } from "node:crypto";
|
|
2
|
+
import { computeBoundaryHmac } from "./prompt-injection-quarantine.js";
|
|
3
|
+
/**
 * Serialize an action into its deterministic canonical string: a JSON object
 * with the keys `tool` (whitespace-trimmed), `args` and `cwd`, in that order.
 *
 * The CALLER is responsible for resolving shell expansion, environment
 * substitution and paths BEFORE calling this, so the serialized bytes describe
 * exactly what will execute rather than a wrapper around it.
 */
export function canonicalizeAction(action) {
    const canonical = {
        tool: action.tool.trim(),
        args: action.args,
        cwd: action.cwd,
    };
    return JSON.stringify(canonical);
}
|
|
11
|
+
/**
 * Stateful approval binder. The set of consumed nonces lives inside the
 * instance (per-instance state, not module-global). Inject `now` through the
 * options for deterministic tests.
 *
 * NOTE(review): only the canonicalized action is covered by the keyed hash;
 * `nonce` and `expiresAt` are carried in the binding unauthenticated, so this
 * class assumes the binding object itself cannot be altered between `bind`
 * and `verify` — confirm against the callers.
 */
export class ApprovalBinder {
    secret;
    ttlMs;
    now;
    consumed = new Set();
    counter = 0;
    /**
     * @param secret Key handed to `computeBoundaryHmac`.
     * @param opts   `ttlMs` (default: five minutes) and `now` (default: `Date.now`).
     */
    constructor(secret, opts = {}) {
        this.secret = secret;
        this.ttlMs = opts.ttlMs ?? 5 * 60_000;
        this.now = opts.now ?? (() => Date.now());
    }
    /**
     * Hash the canonicalized action and issue a binding for it.
     *
     * @param action Action being approved.
     * @param nonce  Optional caller-chosen nonce; otherwise derived from the
     *               clock and the per-instance counter.
     * @returns A binding whose `expiresAt` is `now() + ttlMs`.
     */
    bind(action, nonce) {
        const actionHash = computeBoundaryHmac(canonicalizeAction(action), this.secret);
        const id = ++this.counter;
        return {
            bindingId: `bind-${id}`,
            actionHash,
            nonce: nonce ?? `n-${this.now()}-${id}`,
            expiresAt: this.now() + this.ttlMs,
        };
    }
    /**
     * Re-check a binding at execution time: recompute the hash from the action
     * that is ABOUT to run and reject on replay, expiry, or drift. The nonce is
     * consumed only on success. Check order: malformed > replay > expiry > drift.
     *
     * @returns `{ ok: true }`, or `{ ok: false, reason }`. Malformed bindings
     *          are rejected, never thrown on and never allowed.
     */
    verify(binding, action) {
        // Fail closed on malformed input. A NaN or non-numeric `expiresAt` would
        // otherwise slip past the expiry check below (`now > NaN` is false), and
        // a non-string hash would make Buffer.from throw instead of rejecting.
        if (binding === null ||
            typeof binding !== "object" ||
            typeof binding.nonce !== "string" ||
            typeof binding.actionHash !== "string" ||
            typeof binding.expiresAt !== "number" ||
            Number.isNaN(binding.expiresAt)) {
            return { ok: false, reason: "malformed: binding has missing or invalid fields" };
        }
        if (this.consumed.has(binding.nonce)) {
            return { ok: false, reason: "replay: nonce already consumed" };
        }
        if (this.now() > binding.expiresAt) {
            return { ok: false, reason: "expired: approval window elapsed" };
        }
        const recomputed = computeBoundaryHmac(canonicalizeAction(action), this.secret);
        const a = Buffer.from(recomputed, "utf-8");
        const b = Buffer.from(binding.actionHash, "utf-8");
        // timingSafeEqual throws on unequal lengths, so compare lengths first.
        if (a.length !== b.length || !timingSafeEqual(a, b)) {
            return { ok: false, reason: "hash drift: action changed since approval" };
        }
        this.consumed.add(binding.nonce);
        return { ok: true };
    }
}
|
|
@@ -19,6 +19,8 @@ export interface ApprovalRequest {
|
|
|
19
19
|
readonly tool: string;
|
|
20
20
|
readonly args: string;
|
|
21
21
|
readonly reason: string;
|
|
22
|
+
/** Optional exact-bytes approval-binding hash (see security/approval-binding.ts). */
|
|
23
|
+
readonly hash?: string;
|
|
22
24
|
readonly risk: "low" | "medium" | "high" | "critical";
|
|
23
25
|
readonly riskLevel: "moderate" | "high" | "critical";
|
|
24
26
|
readonly createdAt: number;
|
|
@@ -26,7 +26,10 @@ const DEFAULT_POLICIES = [
|
|
|
26
26
|
riskLevel: "moderate",
|
|
27
27
|
channels: ["cli", "desktop"],
|
|
28
28
|
timeoutMs: 60_000, // 1 minute
|
|
29
|
-
|
|
29
|
+
// Fail CLOSED: an absent/stalled human must never auto-approve a write or
|
|
30
|
+
// edit. An adversarial agent could otherwise stall past the timeout to win
|
|
31
|
+
// an automatic approval. (bash is already fail-closed; this matches it.)
|
|
32
|
+
defaultOnTimeout: "deny",
|
|
30
33
|
},
|
|
31
34
|
];
|
|
32
35
|
// ── Approval Manager ─────────────────────────────────────
|
|
@@ -71,7 +74,7 @@ export class HumanApprovalManager {
|
|
|
71
74
|
feedback: `Approval timed out after ${timeoutMs}ms. Default: ${defaultOnTimeout}.`,
|
|
72
75
|
respondedAt: Date.now(),
|
|
73
76
|
};
|
|
74
|
-
this.history.push(response);
|
|
77
|
+
this.history.push({ request, response });
|
|
75
78
|
resolve(response);
|
|
76
79
|
}
|
|
77
80
|
}, timeoutMs);
|
|
@@ -80,7 +83,7 @@ export class HumanApprovalManager {
|
|
|
80
83
|
request,
|
|
81
84
|
resolve: (response) => {
|
|
82
85
|
clearTimeout(timer);
|
|
83
|
-
this.history.push(response);
|
|
86
|
+
this.history.push({ request, response });
|
|
84
87
|
resolve(response);
|
|
85
88
|
},
|
|
86
89
|
});
|
|
@@ -113,7 +116,7 @@ export class HumanApprovalManager {
|
|
|
113
116
|
* Get approval history.
|
|
114
117
|
*/
|
|
115
118
|
getHistory() {
|
|
116
|
-
return this.history;
|
|
119
|
+
return this.history.map((record) => record.response);
|
|
117
120
|
}
|
|
118
121
|
/**
|
|
119
122
|
* Add a custom policy.
|
|
@@ -146,30 +149,18 @@ export class HumanApprovalManager {
|
|
|
146
149
|
* Get full audit log of all approval decisions (request + result pairs).
|
|
147
150
|
*/
|
|
148
151
|
getAuditLog() {
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
tool: "",
|
|
156
|
-
args: "",
|
|
157
|
-
reason: "",
|
|
158
|
-
risk: "medium",
|
|
159
|
-
riskLevel: "moderate",
|
|
160
|
-
createdAt: response.respondedAt,
|
|
161
|
-
timestamp: response.respondedAt,
|
|
162
|
-
timeoutMs: 0,
|
|
163
|
-
channels: [],
|
|
164
|
-
};
|
|
165
|
-
const result = {
|
|
152
|
+
// The REAL request is recorded at decision time and read back verbatim —
|
|
153
|
+
// never reconstructed — so the audit trail can never drift from what was
|
|
154
|
+
// actually approved.
|
|
155
|
+
return this.history.map(({ request, response }) => ({
|
|
156
|
+
request,
|
|
157
|
+
result: {
|
|
166
158
|
approved: response.decision === "approve",
|
|
167
159
|
approvedBy: response.respondedBy,
|
|
168
160
|
feedback: response.feedback,
|
|
169
161
|
decidedAt: response.respondedAt,
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
});
|
|
162
|
+
},
|
|
163
|
+
}));
|
|
173
164
|
}
|
|
174
165
|
// ── Private ────────────────────────────────────────────
|
|
175
166
|
assessRisk(tool, args) {
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { type GitRunner } from "./checkout-prep.js";
|
|
2
|
+
import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
|
|
3
|
+
import { type ProofArtifact } from "./proof-artifact.js";
|
|
4
|
+
import { type EnforcementDecision } from "./enforcement.js";
|
|
5
|
+
import type { ReplayRunner, ReplayCommands } from "./replay-runner.js";
|
|
6
|
+
import type { MutationRunner } from "./mutation-gate.js";
|
|
7
|
+
/** Everything `reproduceAutonomousRun` needs to rebuild and re-check a run. */
export interface AutonomousReproduceInput {
    /** The agent's repository; the verifier worktree is registered against it. */
    readonly repoDir: string;
    /** Ref the verifier worktree is checked out at before the diff is applied. */
    readonly baseRef: string;
    /** The agent's changes as unified diff text. */
    readonly diffText: string;
    /** Paths touched by the diff; also passed on as the mutation targets. */
    readonly changedPaths: readonly string[];
    /** The check results the agent claims. */
    readonly claimed: ClaimedChecks;
    /** Commands to replay inside the verifier checkout. */
    readonly commands: ReplayCommands;
    /** A fresh, non-existent directory for the verifier-box worktree. */
    readonly worktreeDir: string;
    /** Paths to symlink from repoDir into the worktree (e.g. ["node_modules"]). */
    readonly linkFromRepo?: readonly string[];
}
/** Injected side-effect runners; `mutation` is only forwarded when provided. */
export interface AutonomousReproduceRunners {
    readonly git: GitRunner;
    readonly replay: ReplayRunner;
    readonly mutation?: MutationRunner;
}
/** Verdict, proof artifact and the enforcement decision derived from the verdict. */
export interface AutonomousReproduceOutput {
    readonly result: ReproductionResult;
    readonly proof: ProofArtifact;
    readonly enforcement: EnforcementDecision;
}
/**
 * Full production composition: prepare an isolated checkout (the separate
 * trust boundary), reproduce the claimed result inside it, decide enforcement,
 * and clean up the worktree afterwards. A failed checkout is reported as an
 * infrastructure error, never as an automatic pass.
 */
export declare function reproduceAutonomousRun(input: AutonomousReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
/** Input for `runWorkspaceReproduction`; base ref and diff are captured from `cwd`. */
export interface WorkspaceReproduceInput {
    /** The agent's workspace (its uncommitted changes are captured as the diff). */
    readonly cwd: string;
    /** The check results the agent claims. */
    readonly claimed: ClaimedChecks;
    /** Commands to replay inside the verifier checkout. */
    readonly commands: ReplayCommands;
    /** A fresh, non-existent directory for the verifier-box worktree. */
    readonly worktreeDir: string;
    /** Paths to symlink from the workspace into the worktree (e.g. ["node_modules"]). */
    readonly linkFromRepo?: readonly string[];
}
/**
 * Capture the workspace's git state (HEAD as the base plus the uncommitted
 * diff) and reproduce it. Assumes the agent's changes are UNCOMMITTED (the
 * common autonomous case). If the agent committed mid-run, the diff is empty
 * and the clean base is what gets reproduced.
 */
export declare function runWorkspaceReproduction(input: WorkspaceReproduceInput, runners: AutonomousReproduceRunners): Promise<AutonomousReproduceOutput>;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { prepareVerifierCheckout } from "./checkout-prep.js";
|
|
2
|
+
import { reproduceRun } from "./reproduce.js";
|
|
3
|
+
import { decideReproductionVerdict, } from "./verdict.js";
|
|
4
|
+
import { buildProofArtifact } from "./proof-artifact.js";
|
|
5
|
+
import { enforceReproductionVerdict } from "./enforcement.js";
|
|
6
|
+
/**
 * Production composition of the reproduction pipeline.
 *
 * 1. Prepare an isolated verifier checkout (the separate trust boundary).
 * 2. If that fails, report an infrastructure error with every observed check
 *    set to `null` — never an automatic pass, because nothing was verified.
 * 3. Otherwise reproduce the claimed result inside the checkout and derive the
 *    enforcement decision from the verdict.
 * 4. Once a checkout exists, its cleanup runs whether reproduction returns or
 *    throws.
 *
 * @param input   Repository, base ref, diff, claimed checks and replay commands.
 * @param runners Git and replay runners, plus an optional mutation runner.
 * @returns The verdict, its proof artifact and the enforcement decision.
 */
export async function reproduceAutonomousRun(input, runners) {
    const linkOption = input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {};
    const prepared = await prepareVerifierCheckout({
        repoDir: input.repoDir,
        baseRef: input.baseRef,
        diffText: input.diffText,
        worktreeDir: input.worktreeDir,
        ...linkOption,
    }, runners.git);
    if (prepared.ok) {
        try {
            // Only hand the mutation runner on when the caller supplied one.
            const stageRunners = runners.mutation
                ? { replay: runners.replay, mutation: runners.mutation }
                : { replay: runners.replay };
            const outcome = await reproduceRun({
                claimed: input.claimed,
                changedPaths: input.changedPaths,
                checkoutDir: prepared.checkout.checkoutDir,
                commands: input.commands,
                diffText: input.diffText,
                mutationFiles: input.changedPaths,
            }, stageRunners);
            return {
                result: outcome.result,
                proof: outcome.proof,
                enforcement: enforceReproductionVerdict(outcome.result.verdict),
            };
        }
        finally {
            await prepared.checkout.cleanup();
        }
    }
    // The checkout could not be prepared: nothing was observed.
    const failure = decideReproductionVerdict({
        claimed: input.claimed,
        observed: { testsPass: null, typecheckPass: null, lintPass: null },
        tampered: false,
        infraError: `checkout:${prepared.error}`,
    });
    const failureProof = buildProofArtifact(failure, input.diffText);
    return {
        result: failure,
        proof: failureProof,
        enforcement: enforceReproductionVerdict(failure.verdict),
    };
}
|
|
48
|
+
/**
 * Capture the workspace's git state (HEAD as the base plus the uncommitted
 * diff against HEAD) and reproduce it via `reproduceAutonomousRun`.
 *
 * Assumes the agent's changes are UNCOMMITTED (the common autonomous case). If
 * the agent committed mid-run, the diff is empty and the clean base is what
 * gets reproduced.
 *
 * The diff is taken against HEAD, not the index, so staged changes are part of
 * the reproduced state, and `--binary` keeps binary changes applicable.
 * Untracked files are not part of `git diff` output and are therefore still
 * not captured.
 *
 * Any failing git command is reported through the same infrastructure-error
 * path used for a failed checkout. Without that, a failed `git diff` would look
 * like an empty diff and the clean base would be reproduced in place of the
 * agent's work.
 *
 * @param input   Workspace directory, claimed checks, replay commands, worktree dir.
 * @param runners Git and replay runners, plus an optional mutation runner.
 * @returns The verdict, its proof artifact and the enforcement decision.
 */
export async function runWorkspaceReproduction(input, runners) {
    // Build the "could not verify" output for a failed git capture step.
    const captureFailure = (step, outcome, diffSoFar) => {
        const stderr = typeof outcome.stderr === "string" ? outcome.stderr : "";
        const result = decideReproductionVerdict({
            claimed: input.claimed,
            observed: { testsPass: null, typecheckPass: null, lintPass: null },
            tampered: false,
            infraError: `git:${step}-failed:exit=${outcome.exitCode}:${stderr.trim().slice(0, 240)}`,
        });
        const proof = buildProofArtifact(result, diffSoFar);
        return { result, proof, enforcement: enforceReproductionVerdict(result.verdict) };
    };
    const head = await runners.git.run(input.cwd, ["rev-parse", "HEAD"]);
    if (head.exitCode !== 0) {
        return captureFailure("rev-parse", head, "");
    }
    const diff = await runners.git.run(input.cwd, ["diff", "--binary", "HEAD"]);
    if (diff.exitCode !== 0) {
        return captureFailure("diff", diff, "");
    }
    const names = await runners.git.run(input.cwd, ["diff", "--name-only", "HEAD"]);
    if (names.exitCode !== 0) {
        return captureFailure("diff-name-only", names, diff.stdout);
    }
    const changedPaths = names.stdout
        .split("\n")
        .map((s) => s.trim())
        .filter(Boolean);
    return reproduceAutonomousRun({
        repoDir: input.cwd,
        baseRef: head.stdout.trim(),
        diffText: diff.stdout,
        changedPaths,
        claimed: input.claimed,
        commands: input.commands,
        worktreeDir: input.worktreeDir,
        ...(input.linkFromRepo ? { linkFromRepo: input.linkFromRepo } : {}),
    }, runners);
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prepare the separate-trust-boundary checkout for verify-by-reproduction
|
|
3
|
+
* (spec §3.1, the missing prerequisite for a correct production trigger).
|
|
4
|
+
*
|
|
5
|
+
* The verifier box MUST run a CLEAN base checkout + ONLY the agent's diff — in
|
|
6
|
+
* a separate directory — so the agent's dirty workspace (runtime tampering,
|
|
7
|
+
* stray files, monkeypatched runners) can never influence the result. Running
|
|
8
|
+
* the replay in the agent's own `process.cwd()` would be the BenchJack V1
|
|
9
|
+
* violation this defeats.
|
|
10
|
+
*/
|
|
11
|
+
/** Minimal git executor: runs `git <argv>` in `cwd` and reports the outcome. */
export interface GitRunner {
    readonly run: (cwd: string, argv: readonly string[]) => Promise<{
        exitCode: number;
        stdout: string;
        stderr: string;
    }>;
}
/** Production GitRunner via execFile (argv-only, no shell involved). */
export declare function buildExecGitRunner(): GitRunner;
/** Input for `prepareVerifierCheckout`. */
export interface PrepareCheckoutInput {
    /** The agent's repository (the worktree is registered here). */
    readonly repoDir: string;
    /** The commit the agent started from (before its changes). */
    readonly baseRef: string;
    /** The agent's diff (unified, as from `git diff`). Empty = clean base. */
    readonly diffText: string;
    /** A fresh, non-existent directory for the detached worktree. */
    readonly worktreeDir: string;
    /**
     * Paths (relative to repoDir) to symlink into the worktree after checkout,
     * e.g. ["node_modules"]. A fresh worktree lacks gitignored dependencies, so
     * without this the replayed `npm test` would fail on missing modules and
     * be misread as a contradiction of the agent's claim.
     */
    readonly linkFromRepo?: readonly string[];
}
/** A prepared verifier checkout plus the callback that removes it again. */
export interface VerifierCheckout {
    readonly checkoutDir: string;
    readonly cleanup: () => Promise<void>;
}
/** Either a usable checkout or a short machine-readable error string. */
export type CheckoutResult =
    | { readonly ok: true; readonly checkout: VerifierCheckout }
    | { readonly ok: false; readonly error: string };
/**
 * Create a detached worktree at `baseRef`, apply the diff (when non-empty) and
 * link the requested paths from the repository.
 */
export declare function prepareVerifierCheckout(input: PrepareCheckoutInput, git: GitRunner): Promise<CheckoutResult>;
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { writeFileSync, symlinkSync, existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
/**
 * Production GitRunner backed by `child_process.execFile`. Arguments are
 * passed as an argv array, so no shell is involved.
 *
 * The returned `run` never rejects; every failure is folded into the result:
 * - `exitCode` is the numeric exit code, `1` for any other error (for example
 *   a spawn failure whose `code` is a string such as "ENOENT"), or `0`.
 * - `stderr` falls back to the error's message when the process produced no
 *   stderr output, so spawn failures are not reported with an empty
 *   diagnostic.
 */
export function buildExecGitRunner() {
    return {
        run: async (cwd, argv) => {
            const { execFile } = await import("node:child_process");
            return new Promise((resolve) => {
                execFile("git", [...argv], { cwd, maxBuffer: 64 * 1024 * 1024 }, (error, stdout, stderr) => {
                    const exitCode = error && typeof error.code === "number"
                        ? Number(error.code)
                        : error
                            ? 1
                            : 0;
                    const stderrText = stderr?.toString() ?? "";
                    // An empty string is not nullish, so `??` alone would never
                    // reach the error message; test for emptiness explicitly.
                    const diagnostic = stderrText !== ""
                        ? stderrText
                        : error instanceof Error
                            ? error.message
                            : "";
                    resolve({
                        exitCode,
                        stdout: stdout?.toString() ?? "",
                        stderr: diagnostic,
                    });
                });
            });
        },
    };
}
|
|
25
|
+
/**
 * Build the verifier checkout: a detached worktree at `input.baseRef` with the
 * agent's diff applied on top and the requested paths symlinked in.
 *
 * Steps:
 * 1. `git worktree add --detach <worktreeDir> <baseRef>` in `repoDir`.
 * 2. If `diffText` is non-blank, write it to a temporary patch file inside the
 *    worktree and `git apply` it. The patch file is removed afterwards so the
 *    checkout holds only the base plus the applied diff.
 * 3. Symlink each `linkFromRepo` entry from the repo into the worktree
 *    (best-effort; a failed link is left for the replay to surface).
 *
 * When writing or applying the patch fails, the worktree is removed again
 * before the error is returned.
 *
 * @param input Repository, base ref, diff text, worktree dir and link paths.
 * @param git   Runner used for every git invocation.
 * @returns `{ ok: true, checkout }` with a `cleanup` callback that force-removes
 *          the worktree, or `{ ok: false, error }` naming the failing step.
 */
export async function prepareVerifierCheckout(input, git) {
    const add = await git.run(input.repoDir, [
        "worktree",
        "add",
        "--detach",
        input.worktreeDir,
        input.baseRef,
    ]);
    if (add.exitCode !== 0) {
        return {
            ok: false,
            error: `worktree-add-failed:exit=${add.exitCode}:${add.stderr.trim().slice(0, 240)}`,
        };
    }
    const cleanup = async () => {
        await git.run(input.repoDir, ["worktree", "remove", "--force", input.worktreeDir]);
    };
    if (input.diffText.trim() !== "") {
        const patchPath = join(input.worktreeDir, ".wotann-verify.patch");
        try {
            writeFileSync(patchPath, input.diffText, "utf-8");
        }
        catch (e) {
            await cleanup();
            return {
                ok: false,
                error: `patch-write-failed:${e instanceof Error ? e.message : String(e)}`,
            };
        }
        const apply = await git.run(input.worktreeDir, ["apply", "--whitespace=nowarn", patchPath]);
        if (apply.exitCode !== 0) {
            await cleanup();
            return {
                ok: false,
                error: `apply-failed:exit=${apply.exitCode}:${apply.stderr.trim().slice(0, 240)}`,
            };
        }
        // The patch has served its purpose; leaving it behind would add a stray
        // untracked file to the tree the replayed commands run in.
        const { rmSync: removePatch } = await import("node:fs");
        try {
            removePatch(patchPath, { force: true });
        }
        catch {
            // Best-effort: a leftover patch file does not invalidate the checkout.
        }
    }
    for (const rel of input.linkFromRepo ?? []) {
        const src = join(input.repoDir, rel);
        const dst = join(input.worktreeDir, rel);
        // Best-effort: skip when the source is absent or the target already exists.
        if (existsSync(src) && !existsSync(dst)) {
            try {
                symlinkSync(src, dst, "dir");
            }
            catch {
                // A failed symlink just means the replay may hit missing deps and
                // report a result the verdict logic handles — never a silent pass.
            }
        }
    }
    return { ok: true, checkout: { checkoutDir: input.worktreeDir, cleanup } };
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
|
|
3
|
+
* must touch ONLY source — never the test harness / its own grader. A diff
|
|
4
|
+
* that edits the grading surface is TAMPER, not a normal failure.
|
|
5
|
+
*
|
|
6
|
+
* NOTE: package.json is deliberately NOT protected here (legit dep edits are
|
|
7
|
+
* common); detecting edits to its `scripts.test` field specifically is a
|
|
8
|
+
* Phase-B field-level refinement.
|
|
9
|
+
*/
|
|
10
|
+
/** Path patterns that count as the grading surface when no custom list is given. */
export declare const DEFAULT_PROTECTED_PATTERNS: readonly RegExp[];
/** Describes which test files the agent was allowed to touch in this task. */
export interface TestAuthorship {
    /** Matches files considered "tests" (whose edits are gated). */
    readonly testFilePattern: RegExp;
    /** Test files the agent legitimately created/edited THIS task (allowlisted). */
    readonly authoredTestFiles: readonly string[];
}
/** Result of `checkDiff`: `tampered` is true exactly when `offendingPaths` is non-empty. */
export interface DiffCheckResult {
    readonly tampered: boolean;
    readonly offendingPaths: readonly string[];
}
/**
 * Pure check of the paths an agent's diff touched. `changedPaths` are
 * repo-relative POSIX paths. A path is offending when it matches a protected
 * pattern, OR when it is a test file (per `authorship.testFilePattern`) that is
 * not listed in `authorship.authoredTestFiles`.
 */
export declare function checkDiff(changedPaths: readonly string[], patterns?: readonly RegExp[], authorship?: TestAuthorship): DiffCheckResult;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Grading-surface diff-checker (BenchJack V2/V8 defense). The agent's diff
|
|
3
|
+
* must touch ONLY source — never the test harness / its own grader. A diff
|
|
4
|
+
* that edits the grading surface is TAMPER, not a normal failure.
|
|
5
|
+
*
|
|
6
|
+
* NOTE: package.json is deliberately NOT protected here (legit dep edits are
|
|
7
|
+
* common); detecting edits to its `scripts.test` field specifically is a
|
|
8
|
+
* Phase-B field-level refinement.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Paths treated as the grading surface by default: pytest/tox configuration,
 * jest/vitest/playwright config files, `.mocharc.*`, and anything under `.git`.
 */
export const DEFAULT_PROTECTED_PATTERNS = Object.freeze([
    /(^|\/)conftest\.py$/,
    /(^|\/)pytest\.ini$/,
    /(^|\/)tox\.ini$/,
    /(^|\/)(jest|vitest|playwright)\.config\.[cm]?[jt]s$/,
    /(^|\/)\.mocharc\.[a-z]+$/,
    /(^|\/)\.git(\/|$)/,
]);
/**
 * Check the paths touched by an agent's diff against the grading surface.
 *
 * `changedPaths` are repo-relative POSIX paths. A path is offending when it
 * matches a protected pattern, OR when it is a test file (per
 * `authorship.testFilePattern`) that is not in `authorship.authoredTestFiles`.
 *
 * Patterns carrying the `g` or `y` flag keep a `lastIndex` between `test`
 * calls, which would let every other offending path slip through. Their
 * `lastIndex` is therefore reset before each match, so each path is always
 * tested from its start.
 *
 * @param changedPaths Paths touched by the diff.
 * @param patterns     Protected patterns; defaults to `DEFAULT_PROTECTED_PATTERNS`.
 * @param authorship   Optional test-file gate with its allowlist.
 * @returns `tampered` (true when any path offends) and the offending paths in input order.
 */
export function checkDiff(changedPaths, patterns = DEFAULT_PROTECTED_PATTERNS, authorship) {
    const authored = new Set(authorship?.authoredTestFiles ?? []);
    // Stateless match: only stateful (global/sticky) regexes need the reset,
    // which also avoids writing to frozen non-global RegExp objects.
    const matchesFromStart = (re, path) => {
        if (re.global || re.sticky) {
            re.lastIndex = 0;
        }
        return re.test(path);
    };
    const offendingPaths = changedPaths.filter((p) => {
        if (patterns.some((re) => matchesFromStart(re, p)))
            return true;
        if (authorship && matchesFromStart(authorship.testFilePattern, p) && !authored.has(p))
            return true;
        return false;
    });
    return { tampered: offendingPaths.length > 0, offendingPaths };
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import type { ReproductionVerdict } from "./verdict.js";
|
|
2
|
+
/** What the caller should do with a reproduction verdict. */
export type EnforcementAction = "allow" | "block" | "surface" | "escalate";
/** An enforcement action together with the human-readable reason for it. */
export interface EnforcementDecision {
    readonly action: EnforcementAction;
    readonly reason: string;
}
/**
 * Asymmetric enforcement keyed on the REPRODUCTION verdict. A false pass is
 * treated as far more costly than a false block, so the `tamper` and
 * `contradicted` verdicts hard-block, `weak-tests` is surfaced, `infra-error`
 * is escalated, and only `reproduced` is allowed.
 */
export declare function enforceReproductionVerdict(verdict: ReproductionVerdict): EnforcementDecision;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Map a reproduction verdict to an enforcement decision.
 *
 * Enforcement is asymmetric: a false PASS is treated as far more costly than a
 * false BLOCK, which is recoverable. Hence:
 * - `tamper`, `contradicted` -> `block`
 * - `weak-tests`             -> `surface`
 * - `infra-error`            -> `escalate`
 * - `reproduced`             -> `allow`
 *
 * Any other value fails closed with `escalate` instead of returning
 * `undefined`, so callers reading `.action` / `.reason` never crash and an
 * unrecognized verdict can never be mistaken for a pass.
 *
 * @param verdict The verdict produced by the independent reproduction.
 * @returns The action to take and a human-readable reason.
 */
export function enforceReproductionVerdict(verdict) {
    switch (verdict) {
        case "tamper":
            return { action: "block", reason: "diff tampered with the grading surface" };
        case "contradicted":
            return {
                action: "block",
                reason: "claimed success contradicted by independent reproduction",
            };
        case "weak-tests":
            return {
                action: "surface",
                reason: "reproduction passed but tests are too weak to trust (low mutation score)",
            };
        case "infra-error":
            return {
                action: "escalate",
                reason: "could not reproduce — verify manually before trusting",
            };
        case "reproduced":
            return { action: "allow", reason: "independently reproduced in a separate trust boundary" };
        default:
            // Unknown verdict: never allow, and hand the decision to a human.
            return {
                action: "escalate",
                reason: `unrecognized reproduction verdict "${String(verdict)}" — verify manually before trusting`,
            };
    }
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { ReplayRunner } from "./replay-runner.js";
|
|
2
|
+
/**
 * Creates the production `ReplayRunner`, implemented on top of
 * `child_process.execFile`.
 *
 * - Commands are passed as an argv array (no shell), so the command content is
 *   never interpolated — the same contract as the codebase's execFileNoThrow.
 * - `runInDir(dir, argv)` executes the claimed command with `dir` (the
 *   verifier-box checkout) as its working directory.
 * - `probe()` is an honest check that the Node interpreter can be launched; on
 *   failure it reports the reason so that `runReplay` produces `infra-error`
 *   instead of silently passing.
 *
 * Trust-boundary note: execution happens on the HOST inside the given
 * directory. Container-level isolation (spec §3.1) is deferred production
 * hardening; this is the host-directory baseline that lets the loop run for real.
 */
export declare function buildExecReplayRunner(): ReplayRunner;
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Production `ReplayRunner` backed by `child_process.execFile` (argv-only — no
|
|
3
|
+
* shell interpolation, so injection-safe regardless of the command content,
|
|
4
|
+
* matching the codebase's execFileNoThrow contract). Runs the claimed commands
|
|
5
|
+
* inside `dir`, the verifier-box checkout.
|
|
6
|
+
*
|
|
7
|
+
* Honest probe: confirms the Node interpreter is runnable; reports the failure
|
|
8
|
+
* explicitly otherwise (so runReplay emits `infra-error`, never a silent pass).
|
|
9
|
+
*
|
|
10
|
+
* Trust-boundary note: this runs on the HOST in a given directory. Full
|
|
11
|
+
* separate-trust-boundary isolation (a container, per spec §3.1) is the deferred
|
|
12
|
+
* production hardening — this is the host-dir baseline so the loop runs for real.
|
|
13
|
+
*/
|
|
14
|
+
export function buildExecReplayRunner() {
|
|
15
|
+
return {
|
|
16
|
+
probe: async () => {
|
|
17
|
+
const { execFile } = await import("node:child_process");
|
|
18
|
+
return new Promise((resolve) => {
|
|
19
|
+
execFile(process.execPath, ["--version"], (error) => {
|
|
20
|
+
resolve(error
|
|
21
|
+
? { ok: false, reason: error instanceof Error ? error.message : String(error) }
|
|
22
|
+
: { ok: true });
|
|
23
|
+
});
|
|
24
|
+
});
|
|
25
|
+
},
|
|
26
|
+
runInDir: async (dir, argv) => {
|
|
27
|
+
const { execFile } = await import("node:child_process");
|
|
28
|
+
const [file, ...rest] = argv;
|
|
29
|
+
if (!file)
|
|
30
|
+
return { exitCode: 1, stdout: "", stderr: "empty argv" };
|
|
31
|
+
return new Promise((resolve) => {
|
|
32
|
+
execFile(file, rest, { cwd: dir, maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => {
|
|
33
|
+
const exitCode = error && typeof error.code === "number"
|
|
34
|
+
? Number(error.code)
|
|
35
|
+
: error
|
|
36
|
+
? 1
|
|
37
|
+
: 0;
|
|
38
|
+
resolve({
|
|
39
|
+
exitCode,
|
|
40
|
+
stdout: stdout?.toString() ?? "",
|
|
41
|
+
stderr: stderr?.toString() ?? (error instanceof Error ? error.message : ""),
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
},
|
|
46
|
+
};
|
|
47
|
+
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export { decideReproductionVerdict, type ReproductionVerdict, type ClaimedChecks, type ObservedChecks, type ReproductionInput, type ReproductionResult, } from "./verdict.js";
|
|
2
|
+
export { checkDiff, DEFAULT_PROTECTED_PATTERNS, type DiffCheckResult, type TestAuthorship, } from "./diff-checker.js";
|
|
3
|
+
export { runReplay, type ReplayRunner, type ReplayCommands, type ReplayInput, } from "./replay-runner.js";
|
|
4
|
+
export { buildProofArtifact, type ProofArtifact } from "./proof-artifact.js";
|
|
5
|
+
export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, type MutationResult, type MutationGateResult, type MutationRunner, } from "./mutation-gate.js";
|
|
6
|
+
export { enforceReproductionVerdict, type EnforcementAction, type EnforcementDecision, } from "./enforcement.js";
|
|
7
|
+
export { reproduceRun, type ReproduceInput, type ReproduceRunners, type ReproduceOutput, } from "./reproduce.js";
|
|
8
|
+
export { buildExecReplayRunner } from "./exec-runner.js";
|
|
9
|
+
export { prepareVerifierCheckout, buildExecGitRunner, type GitRunner, type PrepareCheckoutInput, type VerifierCheckout, type CheckoutResult, } from "./checkout-prep.js";
|
|
10
|
+
export { reproduceAutonomousRun, runWorkspaceReproduction, type AutonomousReproduceInput, type AutonomousReproduceRunners, type AutonomousReproduceOutput, type WorkspaceReproduceInput, } from "./autonomous-gate.js";
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export { decideReproductionVerdict, } from "./verdict.js";
|
|
2
|
+
export { checkDiff, DEFAULT_PROTECTED_PATTERNS, } from "./diff-checker.js";
|
|
3
|
+
export { runReplay, } from "./replay-runner.js";
|
|
4
|
+
export { buildProofArtifact } from "./proof-artifact.js";
|
|
5
|
+
export { gateMutation, runMutationGate, DEFAULT_MUTATION_THRESHOLD, } from "./mutation-gate.js";
|
|
6
|
+
export { enforceReproductionVerdict, } from "./enforcement.js";
|
|
7
|
+
export { reproduceRun, } from "./reproduce.js";
|
|
8
|
+
export { buildExecReplayRunner } from "./exec-runner.js";
|
|
9
|
+
export { prepareVerifierCheckout, buildExecGitRunner, } from "./checkout-prep.js";
|
|
10
|
+
export { reproduceAutonomousRun, runWorkspaceReproduction, } from "./autonomous-gate.js";
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mutation gate (BenchJack V6 defense): a green test run is worthless if the
|
|
3
|
+
* tests have no real assertions ("100% coverage, 0% mutation score"). Mutate
|
|
4
|
+
* the changed code; if the agent's OWN tests don't kill enough mutants,
|
|
5
|
+
* downgrade a "reproduced" verdict to "weak-tests".
|
|
6
|
+
*
|
|
7
|
+
* The gate DECISION is pure. Running the mutation tool (Stryker/mutmut) inside
|
|
8
|
+
* the verifier box is the injected `MutationRunner` (production wiring deferred,
|
|
9
|
+
* same DI pattern as replay-runner / vm-isolation).
|
|
10
|
+
*/
|
|
11
|
+
/**
 * Raw counts reported by a mutation-testing run. `gateMutation` turns them
 * into a score of `killed / total`.
 */
export interface MutationResult {
    /** Number of mutants the tests killed (the score's numerator). */
    readonly killed: number;
    /** Number of mutants evaluated; `<= 0` is treated as "no mutants generated". */
    readonly total: number;
}
|
|
15
|
+
/** Outcome of applying the mutation-score threshold to a `MutationResult`. */
export interface MutationGateResult {
    /** `true` when the tests were judged too weak (score under threshold, or no mutants). */
    readonly weakTests: boolean;
    /** The computed mutation score (`killed / total`), or 0 when none could be computed. */
    readonly score: number;
    /** The threshold the score was compared against. */
    readonly threshold: number;
    /** Human-readable explanation; populated when `weakTests` is `true`. */
    readonly reason?: string;
}
|
|
21
|
+
/** Default minimum mutation score; used by the gate when no threshold is supplied. */
export declare const DEFAULT_MUTATION_THRESHOLD = 0.6;
|
|
22
|
+
/**
 * Pure threshold decision over mutation counts.
 *
 * When `total <= 0` no mutants were generated, meaning the tests never
 * exercise the changed code — the worst case — so the result is `weak`.
 * Otherwise the score `killed / total` is compared against `threshold`.
 *
 * @param result    Killed/total counts from the mutation run.
 * @param threshold Minimum acceptable score; defaults to `DEFAULT_MUTATION_THRESHOLD`.
 * @returns The gate result, with a `reason` whenever the tests are weak.
 */
export declare function gateMutation(result: MutationResult, threshold?: number): MutationGateResult;
|
|
27
|
+
/**
 * Dependency-injected mutation runner (same DI approach as the replay runner).
 * Production is meant to wire a real tool such as Stryker/mutmut; tests pass a stub.
 */
export interface MutationRunner {
    /** Reports whether the mutation tool is usable; `reason` explains a failure. */
    readonly probe: () => Promise<{
        ok: boolean;
        reason?: string;
    }>;
    /** Runs mutation testing in `dir` against `changedFiles` and returns the raw counts. */
    readonly run: (dir: string, changedFiles: readonly string[]) => Promise<MutationResult>;
}
|
|
35
|
+
/**
 * Probes the injected runner, then runs and gates the mutation result.
 *
 * Honest stub: if the probe fails the mutation tool is considered unavailable.
 * Because mutation testing is only a BONUS downgrade (enforcement is carried
 * by the reproduction channel), an unavailable tool does NOT mark the tests as
 * weak — the result merely records, in `unavailable`, why the gate was skipped.
 *
 * @param dir          Directory the mutation run executes in.
 * @param changedFiles Files to mutate.
 * @param runner       Injected mutation runner.
 * @param threshold    Minimum acceptable score; defaults to `DEFAULT_MUTATION_THRESHOLD`.
 */
export declare function runMutationGate(dir: string, changedFiles: readonly string[], runner: MutationRunner, threshold?: number): Promise<MutationGateResult & {
    readonly unavailable?: string;
}>;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
export const DEFAULT_MUTATION_THRESHOLD = 0.6;
|
|
2
|
+
/**
|
|
3
|
+
* Pure. `total <= 0` means no mutants were generated — the tests do not
|
|
4
|
+
* exercise the changed code at all — which is the worst case, so it is `weak`.
|
|
5
|
+
*/
|
|
6
|
+
export function gateMutation(result, threshold = DEFAULT_MUTATION_THRESHOLD) {
|
|
7
|
+
if (result.total <= 0) {
|
|
8
|
+
return {
|
|
9
|
+
weakTests: true,
|
|
10
|
+
score: 0,
|
|
11
|
+
threshold,
|
|
12
|
+
reason: "no mutants generated — tests do not exercise the changed code",
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
const score = result.killed / result.total;
|
|
16
|
+
if (score < threshold) {
|
|
17
|
+
return {
|
|
18
|
+
weakTests: true,
|
|
19
|
+
score,
|
|
20
|
+
threshold,
|
|
21
|
+
reason: `mutation score ${score.toFixed(2)} < threshold ${threshold}`,
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
return { weakTests: false, score, threshold };
|
|
25
|
+
}
|
|
26
|
+
/**
|
|
27
|
+
* Honest stub: a failed probe means the mutation tool is unavailable. Mutation
|
|
28
|
+
* testing is a BONUS downgrade (the reproduction channel carries enforcement),
|
|
29
|
+
* so "unavailable" does NOT set `weakTests` — it just records why it was skipped.
|
|
30
|
+
*/
|
|
31
|
+
export async function runMutationGate(dir, changedFiles, runner, threshold = DEFAULT_MUTATION_THRESHOLD) {
|
|
32
|
+
const probe = await runner.probe();
|
|
33
|
+
if (!probe.ok) {
|
|
34
|
+
return {
|
|
35
|
+
weakTests: false,
|
|
36
|
+
score: 0,
|
|
37
|
+
threshold,
|
|
38
|
+
unavailable: `mutation:unavailable${probe.reason ? `:${probe.reason}` : ""}`,
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
const result = await runner.run(dir, changedFiles);
|
|
42
|
+
return gateMutation(result, threshold);
|
|
43
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import { type ChainExport } from "../../security/hash-audit-chain.js";
|
|
2
|
+
import type { ReproductionResult, ReproductionVerdict, ClaimedChecks, ObservedChecks } from "./verdict.js";
|
|
3
|
+
/**
 * A reproduction verdict packaged as a first-class, hash-chained,
 * harness-signed PROOF ARTIFACT rather than a bare boolean.
 *
 * `chainExport` is tamper-evident (SHA-256 linked): editing the recorded
 * verdict or observations after the fact breaks `HashAuditChain.verify()`.
 */
export interface ProofArtifact {
    /** The verdict the harness reached. */
    readonly verdict: ReproductionVerdict;
    /** Content identifier derived from the diff text. */
    readonly diffCid: string;
    /** The check results that were claimed. */
    readonly claimed: ClaimedChecks;
    /** The check results the harness itself observed. */
    readonly observed: ObservedChecks;
    /** One message per claimed-pass / observed-fail mismatch. */
    readonly contradictions: readonly string[];
    /** Exported audit chain holding the recorded verdict entry. */
    readonly chainExport: ChainExport;
}
|
|
16
|
+
/**
 * Builds the proof artifact for a reproduction result.
 *
 * @param result   The decided reproduction result to record.
 * @param diffText Diff text whose content identifier becomes `diffCid`.
 * @param actor    Actor name recorded on the audit-chain entry (optional).
 * @returns The artifact, including the exported hash chain.
 */
export declare function buildProofArtifact(result: ReproductionResult, diffText: string, actor?: string): ProofArtifact;
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { HashAuditChain } from "../../security/hash-audit-chain.js";
|
|
2
|
+
import { cidOf } from "../../core/content-cid.js";
|
|
3
|
+
export function buildProofArtifact(result, diffText, actor = "wotann-verifier") {
|
|
4
|
+
const diffCid = cidOf(diffText);
|
|
5
|
+
const chain = new HashAuditChain();
|
|
6
|
+
chain.append("reproduction.verdict", actor, {
|
|
7
|
+
verdict: result.verdict,
|
|
8
|
+
diffCid,
|
|
9
|
+
claimed: result.claimed,
|
|
10
|
+
observed: result.observed,
|
|
11
|
+
contradictions: result.contradictions,
|
|
12
|
+
...(result.infraError ? { infraError: result.infraError } : {}),
|
|
13
|
+
});
|
|
14
|
+
return {
|
|
15
|
+
verdict: result.verdict,
|
|
16
|
+
diffCid,
|
|
17
|
+
claimed: result.claimed,
|
|
18
|
+
observed: result.observed,
|
|
19
|
+
contradictions: result.contradictions,
|
|
20
|
+
chainExport: chain.exportChain(),
|
|
21
|
+
};
|
|
22
|
+
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import type { ObservedChecks } from "./verdict.js";
|
|
2
|
+
/**
 * Structural runner interface (modelled on VmRunner in
 * computer-use/vm-isolation.ts). Tests supply an in-memory stub; production is
 * expected to execute the argv INSIDE the verifier box (container/sandbox —
 * the separate trust boundary). Commands are always an argv array, never a
 * shell string, so nothing is interpolated.
 */
export interface ReplayRunner {
    /** Reports whether the runner can execute commands; `reason` explains a failure. */
    readonly probe: () => Promise<{
        ok: boolean;
        reason?: string;
    }>;
    /** Executes `argv` with `dir` as the working directory and returns the process outcome. */
    readonly runInDir: (dir: string, argv: readonly string[]) => Promise<{
        exitCode: number;
        stdout: string;
        stderr: string;
    }>;
}
|
|
19
|
+
/**
 * The argv for each check to replay. Every entry is optional; a missing or
 * empty argv means that check is not run.
 */
export interface ReplayCommands {
    /** Command that runs the test suite. */
    readonly test?: readonly string[];
    /** Command that runs the type checker. */
    readonly typecheck?: readonly string[];
    /** Command that runs the linter. */
    readonly lint?: readonly string[];
}
|
|
24
|
+
/** Everything `runReplay` needs: where to run and which commands to run. */
export interface ReplayInput {
    /** The clean checkout with the diff applied, inside the verifier box (separate trust boundary). */
    readonly checkoutDir: string;
    /** The check commands to replay in that checkout. */
    readonly commands: ReplayCommands;
}
|
|
29
|
+
/**
 * Replays the claimed commands in the verifier box and returns the HARNESS's
 * own pass/fail per check (exit code 0 means pass). The agent's self-reported
 * result is NEVER consulted.
 *
 * Honest stub: when the runner's probe fails, every observed check is `null`
 * and `infraError` is set, so the verdict logic yields `infra-error` and can
 * never auto-pass.
 *
 * @param input  Checkout directory and the commands to replay.
 * @param runner Injected runner that executes the commands.
 */
export declare function runReplay(input: ReplayInput, runner: ReplayRunner): Promise<ObservedChecks & {
    readonly infraError?: string;
}>;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Re-run the claimed commands in the verifier box and report the HARNESS's own
|
|
3
|
+
* pass/fail (exit code 0 = pass). NEVER reads the agent's self-reported result.
|
|
4
|
+
* Honest stub: a failed probe yields all-null observed + infraError, so the
|
|
5
|
+
* verdict logic emits `infra-error` and never auto-passes.
|
|
6
|
+
*/
|
|
7
|
+
export async function runReplay(input, runner) {
|
|
8
|
+
const probe = await runner.probe();
|
|
9
|
+
if (!probe.ok) {
|
|
10
|
+
return {
|
|
11
|
+
testsPass: null,
|
|
12
|
+
typecheckPass: null,
|
|
13
|
+
lintPass: null,
|
|
14
|
+
infraError: `replay:unavailable${probe.reason ? `:${probe.reason}` : ""}`,
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
const observe = async (argv) => {
|
|
18
|
+
if (!argv || argv.length === 0)
|
|
19
|
+
return null;
|
|
20
|
+
const r = await runner.runInDir(input.checkoutDir, argv);
|
|
21
|
+
return r.exitCode === 0;
|
|
22
|
+
};
|
|
23
|
+
return {
|
|
24
|
+
testsPass: await observe(input.commands.test),
|
|
25
|
+
typecheckPass: await observe(input.commands.typecheck),
|
|
26
|
+
lintPass: await observe(input.commands.lint),
|
|
27
|
+
};
|
|
28
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { type TestAuthorship } from "./diff-checker.js";
|
|
2
|
+
import { type ReplayRunner, type ReplayCommands } from "./replay-runner.js";
|
|
3
|
+
import { type MutationRunner } from "./mutation-gate.js";
|
|
4
|
+
import { type ClaimedChecks, type ReproductionResult } from "./verdict.js";
|
|
5
|
+
import { type ProofArtifact } from "./proof-artifact.js";
|
|
6
|
+
/** Input to `reproduceRun`: the claim, the diff, and where/how to replay it. */
export interface ReproduceInput {
    /** The check results that were claimed for this change. */
    readonly claimed: ClaimedChecks;
    /** Repo-relative POSIX paths touched by the agent's diff. */
    readonly changedPaths: readonly string[];
    /** The clean checkout with the diff applied, inside the verifier box (separate trust boundary). */
    readonly checkoutDir: string;
    /** The check commands to replay in the checkout. */
    readonly commands: ReplayCommands;
    /** The diff text, used for the proof artifact's content identity. */
    readonly diffText: string;
    /** Optional test-authorship information forwarded to the diff check. */
    readonly authorship?: TestAuthorship;
    /** Changed source files to mutation-test (Phase-B bonus gate). */
    readonly mutationFiles?: readonly string[];
}
|
|
19
|
+
/** The injected runners `reproduceRun` orchestrates. */
export interface ReproduceRunners {
    /** Runner used to replay the claimed commands. */
    readonly replay: ReplayRunner;
    /** Optional — when supplied and the reproduction is clean, it gates weak tests. */
    readonly mutation?: MutationRunner;
}
|
|
24
|
+
/** What `reproduceRun` returns: the decided result plus its proof artifact. */
export interface ReproduceOutput {
    /** The reproduction verdict and supporting detail. */
    readonly result: ReproductionResult;
    /** The hash-chained proof artifact built from `result`. */
    readonly proof: ProofArtifact;
}
|
|
28
|
+
/**
 * One-call composition of the reproduction library. In order, it:
 *   1. tamper-checks the diff,
 *   2. replays the claimed commands in the verifier box (capturing the
 *      harness's OWN result),
 *   3. optionally mutation-gates a clean reproduction,
 *   4. decides the verdict, and
 *   5. emits the hash-chained proof artifact.
 *
 * Pure orchestration over the injected runners.
 */
export declare function reproduceRun(input: ReproduceInput, runners: ReproduceRunners): Promise<ReproduceOutput>;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { checkDiff } from "./diff-checker.js";
|
|
2
|
+
import { runReplay } from "./replay-runner.js";
|
|
3
|
+
import { runMutationGate } from "./mutation-gate.js";
|
|
4
|
+
import { decideReproductionVerdict, } from "./verdict.js";
|
|
5
|
+
import { buildProofArtifact } from "./proof-artifact.js";
|
|
6
|
+
/**
|
|
7
|
+
* Compose the reproduction library into one call: tamper-check the diff, replay
|
|
8
|
+
* the claimed commands in the verifier box (capturing the harness's OWN result),
|
|
9
|
+
* optionally mutation-gate a clean reproduction, decide the verdict, and emit
|
|
10
|
+
* the hash-chained proof artifact. Pure orchestration over injected runners.
|
|
11
|
+
*/
|
|
12
|
+
export async function reproduceRun(input, runners) {
|
|
13
|
+
const diff = checkDiff(input.changedPaths, undefined, input.authorship);
|
|
14
|
+
const observed = await runReplay({ checkoutDir: input.checkoutDir, commands: input.commands }, runners.replay);
|
|
15
|
+
// The mutation gate is a BONUS downgrade — only meaningful on an otherwise
|
|
16
|
+
// clean reproduction (skip it when we already have tamper or infra-error).
|
|
17
|
+
let weakTests = false;
|
|
18
|
+
if (runners.mutation && !diff.tampered && !observed.infraError) {
|
|
19
|
+
const gate = await runMutationGate(input.checkoutDir, input.mutationFiles ?? [], runners.mutation);
|
|
20
|
+
weakTests = gate.weakTests;
|
|
21
|
+
}
|
|
22
|
+
const result = decideReproductionVerdict({
|
|
23
|
+
claimed: input.claimed,
|
|
24
|
+
observed,
|
|
25
|
+
tampered: diff.tampered,
|
|
26
|
+
weakTests,
|
|
27
|
+
infraError: observed.infraError,
|
|
28
|
+
});
|
|
29
|
+
const proof = buildProofArtifact(result, input.diffText);
|
|
30
|
+
return { result, proof };
|
|
31
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
 * The reproduction verdict: the harness's OWN judgment, obtained by re-running
 * the claimed work in a separate trust boundary. It is NEVER the agent's
 * self-report (the V7 violation this whole module exists to defeat).
 */
export type ReproductionVerdict =
    | "reproduced"
    | "contradicted"
    | "weak-tests"
    | "tamper"
    | "infra-error";
|
|
7
|
+
/** The pass/fail outcome that was claimed for each check. */
export interface ClaimedChecks {
    /** Claimed result of the test suite. */
    readonly testsPass: boolean;
    /** Claimed result of the type check. */
    readonly typecheckPass: boolean;
    /** Claimed result of the lint run. */
    readonly lintPass: boolean;
}
|
|
12
|
+
/**
 * The outcome the harness observed for each check. A value of `null` means the
 * check was not run or was unavailable, and therefore cannot contradict a claim.
 */
export interface ObservedChecks {
    /** Observed result of the test suite, or `null` if not run. */
    readonly testsPass: boolean | null;
    /** Observed result of the type check, or `null` if not run. */
    readonly typecheckPass: boolean | null;
    /** Observed result of the lint run, or `null` if not run. */
    readonly lintPass: boolean | null;
}
|
|
18
|
+
/** Everything the pure verdict decision takes into account. */
export interface ReproductionInput {
    /** What was claimed for each check. */
    readonly claimed: ClaimedChecks;
    /** What the harness observed for each check. */
    readonly observed: ObservedChecks;
    /** Whether the diff check flagged tampering. */
    readonly tampered: boolean;
    /** Provided by the Phase-B mutation gate; Phase A never sets it. */
    readonly weakTests?: boolean;
    /** Present when the replay could not run at all. */
    readonly infraError?: string;
}
|
|
27
|
+
/** The decided verdict together with the evidence it was based on. */
export interface ReproductionResult {
    /** The verdict chosen by the precedence rules. */
    readonly verdict: ReproductionVerdict;
    /** The claimed check results, passed through from the input. */
    readonly claimed: ClaimedChecks;
    /** The observed check results, passed through from the input. */
    readonly observed: ObservedChecks;
    /** One message per check that was claimed as passing but observed as failing. */
    readonly contradictions: readonly string[];
    /** The infrastructure error, included only when one was supplied. */
    readonly infraError?: string;
}
|
|
34
|
+
/**
 * Pure verdict decision.
 *
 * Because a false PASS is the moat-killer while a false BLOCK is recoverable,
 * the decision is biased toward blocking. Precedence, highest first:
 * tamper > infra-error > contradicted > weak-tests > reproduced.
 *
 * @param input Claimed and observed checks plus the tamper, weak-tests, and infra-error signals.
 * @returns The verdict with the claimed/observed checks and any contradictions.
 */
export declare function decideReproductionVerdict(input: ReproductionInput): ReproductionResult;
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
const CHECKS = [
|
|
2
|
+
{ name: "tests", claimedKey: "testsPass", observedKey: "testsPass" },
|
|
3
|
+
{ name: "typecheck", claimedKey: "typecheckPass", observedKey: "typecheckPass" },
|
|
4
|
+
{ name: "lint", claimedKey: "lintPass", observedKey: "lintPass" },
|
|
5
|
+
];
|
|
6
|
+
/**
|
|
7
|
+
* Pure decision. Precedence (a false PASS is the moat-killer; a false BLOCK is
|
|
8
|
+
* recoverable, so we bias toward blocking): tamper > infra-error > contradicted
|
|
9
|
+
* > weak-tests > reproduced.
|
|
10
|
+
*/
|
|
11
|
+
export function decideReproductionVerdict(input) {
|
|
12
|
+
const contradictions = [];
|
|
13
|
+
for (const c of CHECKS) {
|
|
14
|
+
const claimed = input.claimed[c.claimedKey];
|
|
15
|
+
const observed = input.observed[c.observedKey];
|
|
16
|
+
// Only a claimed-PASS that the harness observed as FAIL is a contradiction.
|
|
17
|
+
// observed === null means "not run", which cannot contradict anything.
|
|
18
|
+
if (claimed === true && observed === false) {
|
|
19
|
+
contradictions.push(`${c.name}: agent claimed pass, harness observed fail`);
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
let verdict;
|
|
23
|
+
if (input.tampered)
|
|
24
|
+
verdict = "tamper";
|
|
25
|
+
else if (input.infraError)
|
|
26
|
+
verdict = "infra-error";
|
|
27
|
+
else if (contradictions.length > 0)
|
|
28
|
+
verdict = "contradicted";
|
|
29
|
+
else if (input.weakTests)
|
|
30
|
+
verdict = "weak-tests";
|
|
31
|
+
else
|
|
32
|
+
verdict = "reproduced";
|
|
33
|
+
return {
|
|
34
|
+
verdict,
|
|
35
|
+
claimed: input.claimed,
|
|
36
|
+
observed: input.observed,
|
|
37
|
+
contradictions,
|
|
38
|
+
...(input.infraError ? { infraError: input.infraError } : {}),
|
|
39
|
+
};
|
|
40
|
+
}
|