slice-tournament-zoo 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +357 -0
- package/bin/stz.mjs +15 -0
- package/package.json +35 -0
- package/src/README.md +19 -0
- package/src/bridge.ts +950 -0
- package/src/budget.ts +78 -0
- package/src/cli.ts +126 -0
- package/src/cost-tracker.ts +59 -0
- package/src/escalation.ts +89 -0
- package/src/eval-runner.ts +220 -0
- package/src/grpo.ts +54 -0
- package/src/hack-detector.ts +124 -0
- package/src/index.ts +17 -0
- package/src/merge.ts +245 -0
- package/src/mock/README.md +40 -0
- package/src/mock/interfaces.ts +114 -0
- package/src/mock/mock.ts +223 -0
- package/src/mock/orchestrator.ts +457 -0
- package/src/pressure.ts +81 -0
- package/src/project.ts +335 -0
- package/src/seal.ts +182 -0
- package/src/selection.ts +128 -0
- package/src/specdiff.ts +141 -0
- package/src/state.ts +95 -0
- package/src/taxonomy.ts +161 -0
- package/src/types.ts +305 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Layer L3 of the anti-reward-hacking defense (F10): the trace/diff inspector.
|
|
3
|
+
*
|
|
4
|
+
* Scans specimen source for known reward-hacking signatures. Any finding is a
|
|
5
|
+
* disqualification at the eval gate (F7 stage 1). Each finding carries a
|
|
6
|
+
* remediation string that is re-injected into the specimen's next prompt if the
|
|
7
|
+
* failure-replan loop activates (F14, Codex custom-linter pattern).
|
|
8
|
+
*
|
|
9
|
+
* This is a pragmatic regex/line-scan detector (the AST upgrade is a v1.1 task
|
|
10
|
+
* in the roadmap). It is intentionally conservative: it flags the documented
|
|
11
|
+
* patterns and explains why, rather than guessing at novel ones.
|
|
12
|
+
*/
|
|
13
|
+
import type { HackFinding, HackPattern, SpecimenId } from "./types.js";
|
|
14
|
+
|
|
15
|
+
/** One reward-hacking signature, evaluated against a single source line. */
interface Rule {
  /** The hack pattern reported when this rule fires. */
  pattern: HackPattern;

  /**
   * Predicate over one line of source. Receives the scan-wide context so a
   * rule can consult inputs such as the sealed fixture names.
   */
  test: (line: string, ctx: ScanContext) => boolean;

  /** Guidance attached to every finding this rule produces. */
  remediation: string;
}
|
|
21
|
+
|
|
22
|
+
/** Scan-wide inputs shared by every rule invocation. */
interface ScanContext {
  /** Sealed test-fixture names; a specimen must not branch on any of them. */
  fixtureNames: Array<string>;
}
|
|
26
|
+
|
|
27
|
+
/**
 * The documented reward-hacking signatures. Each rule's `test` is a pure
 * predicate over ONE source line; the scanner applies every rule to every line.
 *
 * Every line the previous patterns matched is still matched. The additions
 * only close gaps where a rule named a library but missed its usual call form.
 */
const RULES: Rule[] = [
  {
    pattern: "test-skip",
    test: (l) =>
      /\b(it|test|describe)\.(skip|todo)\b/.test(l) ||
      // `xtest` is Jest's alias alongside `xit` / `xdescribe`.
      /\bx(it|test|describe)\b/.test(l) ||
      /@pytest\.mark\.skip/.test(l) ||
      /\bpytest\.skip\(/.test(l),
    remediation:
      "Do not skip or mark tests todo. Implement the contract so every sealed test runs and passes.",
  },
  {
    pattern: "assertion-mutation",
    test: (l) =>
      // Weakening assertions to always-true forms.
      /\bassert\s+true\b/i.test(l) ||
      /\bexpect\([^)]*\)\.toBe(Truthy|Defined)\(\)\s*;?\s*\/\/\s*always/i.test(l) ||
      /\bassert\s+1\s*==\s*1\b/.test(l) ||
      /\bexpect\(true\)\.toBe\(true\)/.test(l),
    remediation:
      "Do not replace meaningful assertions with tautologies. Assert the real contract behaviour.",
  },
  {
    pattern: "network-bypass",
    test: (l) =>
      // Real network egress in code that should use fixtures. Besides the bare
      // call forms, this covers the method-style calls the named libraries are
      // normally used through: `axios.get(`, `requests.put(`,
      // `urllib.request.urlopen(`, `https.request(`, `http.get(`.
      /\b(fetch|axios(\.(get|post|put|patch|delete|head|options|request))?|requests\.(get|post|put|patch|delete|head|request)|urllib(\.request\.urlopen)?|https?\.(request|get))\s*\(/.test(
        l,
      ) &&
      // A line that mentions a test double or a loopback address is exempt.
      !/mock|fixture|localhost|127\.0\.0\.1/.test(l),
    remediation:
      "Do not make live network calls that bypass fixtures. Use the provided test doubles.",
  },
  {
    pattern: "fixture-keyed-branch",
    test: (l, ctx) =>
      /\b(if|switch|case|when|elif)\b/.test(l) &&
      // An empty fixture name is ignored: `includes("")` is always true and
      // would flag every branching line in the file.
      ctx.fixtureNames.some((f) => f.length > 0 && l.includes(f)),
    remediation:
      "Do not special-case behaviour on a specific test fixture name/value. Implement the general contract.",
  },
  {
    pattern: "hardcoded-test-input",
    test: (l) =>
      // return of a literal keyed on an obvious test sentinel.
      /\bif\b.*==.*["'](test|expected|__TEST__|fixture)["']/.test(l) ||
      /return\s+["'](EXPECTED_OUTPUT|__GOLDEN__)["']/.test(l),
    remediation:
      "Do not hardcode expected outputs keyed on test sentinels. Compute the result from inputs.",
  },
];
|
|
78
|
+
|
|
79
|
+
/**
 * Scan one specimen's source files for reward-hacking signatures.
 *
 * Every supplied file is scanned, line by line, against every rule. That
 * includes helper tests a specimen adds itself, which stay in scope for the
 * skip and assertion-mutation rules.
 *
 * @param specimen - Id stamped on every finding.
 * @param files - Map of relative path → file contents.
 * @param ctx - Scan-wide inputs; defaults to no fixture names.
 * @returns One finding per (line, rule) hit, ordered by file, then line, then
 *   rule. `location` is `path:line` with a 1-based line number.
 */
export function detectHacks(
  specimen: SpecimenId,
  files: Record<string, string>,
  ctx: ScanContext = { fixtureNames: [] },
): HackFinding[] {
  const hits: HackFinding[] = [];
  for (const [path, content] of Object.entries(files)) {
    let lineNo = 0;
    for (const raw of content.split("\n")) {
      lineNo += 1;
      // Full-line comments are blanked first so commented-out code is ignored.
      const code = stripComments(raw);
      const location = `${path}:${lineNo}`;
      for (const rule of RULES) {
        if (!rule.test(code, ctx)) continue;
        hits.push({
          specimen,
          pattern: rule.pattern,
          location,
          remediation: rule.remediation,
        });
      }
    }
  }
  return hits;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Blank out full-line comments so commented-out code does not false-positive.
 *
 * A line counts as a comment only when its first non-whitespace characters are
 * `//` or `#`; such a line becomes "". Trailing comments are deliberately left
 * in place because the assertion-mutation rule keys on `// always` markers.
 * Block comments are not recognised.
 */
function stripComments(line: string): string {
  const isFullLineComment = /^\s*(\/\/|#)/.test(line);
  return isFullLineComment ? "" : line;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Collapse findings into one bullet per hack pattern for a replan prompt (F14).
 *
 * Patterns appear in first-seen order; when a pattern occurs more than once the
 * remediation text of its last finding is used. Returns "" for no findings.
 */
export function remediationContext(findings: HackFinding[]): string {
  const byPattern = findings.reduce(
    (acc, finding) => acc.set(finding.pattern, finding.remediation),
    new Map<HackPattern, string>(),
  );
  const bullets: string[] = [];
  byPattern.forEach((text, pattern) => {
    bullets.push(`- [${pattern}] ${text}`);
  });
  return bullets.join("\n");
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/** STZ public surface. */
|
|
2
|
+
export * from "./types.js";
|
|
3
|
+
export * from "./grpo.js";
|
|
4
|
+
export * from "./selection.js";
|
|
5
|
+
export * from "./hack-detector.js";
|
|
6
|
+
export * from "./escalation.js";
|
|
7
|
+
export * from "./budget.js";
|
|
8
|
+
export * from "./cost-tracker.js";
|
|
9
|
+
export * from "./taxonomy.js";
|
|
10
|
+
export * from "./state.js";
|
|
11
|
+
export * from "./project.js";
|
|
12
|
+
export * from "./pressure.js";
|
|
13
|
+
export * from "./specdiff.js";
|
|
14
|
+
export * from "./seal.js";
|
|
15
|
+
export * from "./mock/orchestrator.js";
|
|
16
|
+
export * as llm from "./mock/interfaces.js";
|
|
17
|
+
export { MockModelLayer, defaultMockConfig, alwaysFailConfig } from "./mock/mock.js";
|
package/src/merge.ts
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-slice merge integrity — sealed-invariant supersession (F13-adjacent).
|
|
3
|
+
*
|
|
4
|
+
* When several slice winners are assembled into one integrated crate, an EARLIER
|
|
5
|
+
* slice's sealed suite can legitimately fail: it encodes an invariant that was
|
|
6
|
+
* correct in isolation but is obsolete under a LATER slice's composition. The
|
|
7
|
+
* canonical case from the dogfood run: slice-03's suite asserts "aliens never
|
|
8
|
+
* respawn", which slice-05's wave-clear deliberately supersedes. The assembled
|
|
9
|
+
* crate fails slice-03's suite — but that is not a merge defect.
|
|
10
|
+
*
|
|
11
|
+
* The danger is the orchestrator hand-waving that distinction ("looks like the
|
|
12
|
+
* expected interaction, moving on") — exactly the unaudited, gameable judgment
|
|
13
|
+
* STZ exists to eliminate. This module makes the call deterministic and audited
|
|
14
|
+
* instead. A failing sealed suite is only sanctioned when ALL of:
|
|
15
|
+
*
|
|
16
|
+
* 1. a **signature-pinned** compat entry matches the actual failure text (not
|
|
17
|
+
* the test name alone — the exact panic/assert substring),
|
|
18
|
+
* 2. the **superseding invariant also passes** on the same assembled crate
|
|
19
|
+
* (you cannot claim supersession when the replacement isn't even proven),
|
|
20
|
+
* 3. the entry is **approved** (the merge agent may propose but not self-bless).
|
|
21
|
+
*
|
|
22
|
+
* Trust boundary (be honest, same split as `eval` vs `record-eval`): this module
|
|
23
|
+
* does NOT run the suites — it can't, the assembled crate may be Rust. It
|
|
24
|
+
* consumes the *reported* per-suite results and deterministically ADJUDICATES
|
|
25
|
+
* those failures against the audited compat rules. A dishonest results file
|
|
26
|
+
* defeats it; the orchestration contract (run the suites in an ephemeral scratch
|
|
27
|
+
* copy, never the canonical crate) lives in the merge command doc.
|
|
28
|
+
*
|
|
29
|
+
 * The ephemeral-scratch contract and the "agent can't self-approve" half of rule 3
|
|
30
|
+
* are conventions backed by AUDIT, not hard barriers: the append-only `history`
|
|
31
|
+
* makes a self-approval a visible anomaly rather than a silent one — STZ's
|
|
32
|
+
* auditability-over-prevention posture (N1). A compat entry is transitional debt:
|
|
33
|
+
* it points at a pending wave-aware amendment and is retired once that
|
|
34
|
+
* `seal-amend` lands and the amended (wave-aware) suite replaces the old one.
|
|
35
|
+
*/
|
|
36
|
+
import { readFileSync, existsSync, mkdirSync } from "node:fs";
|
|
37
|
+
import { writeFileSync } from "node:fs";
|
|
38
|
+
import { join } from "node:path";
|
|
39
|
+
import { stzPath } from "./taxonomy.js";
|
|
40
|
+
|
|
41
|
+
/**
 * The newer invariant that must be seen passing before an old one may be
 * declared superseded.
 */
export interface ReplacementInvariant {
  /** Slice id of the sealed suite whose passing proves the superseding invariant. */
  slice: string;

  /** A specific test in that suite. Informational only; enforcement is suite-level. */
  test?: string;
}
|
|
48
|
+
|
|
49
|
+
/** A single audited claim that an older sealed invariant is superseded by a later slice. */
export interface MergeCompatEntry {
  id: string;

  /** Slice id of the sealed suite that now fails on the assembled crate. */
  supersededSlice: string;

  /** Test target/name inside the superseded suite, when known. */
  supersededTest?: string;

  /**
   * Signature pin: the exact panic/assert substring that must occur in the
   * reported failure text. Matching on the test name alone is forbidden,
   * because a genuinely new bug in the same test would then be laundered as
   * "expected".
   */
  panicSubstring: string;

  /** Slice whose newer invariant legitimately replaces the old one. */
  supersededBy: string;

  /** Invariant that MUST pass in the same run for the claim to hold (rule 2). */
  replacement: ReplacementInvariant;

  reason: string;

  /** Reference to the pending wave-aware amendment that retires this entry (debt). */
  pendingAmendment: string;

  /** Approval gate (rule 3): proposals land as `false`; an approver flips it to `true`. */
  approved: boolean;

  /** Recorded on approval, so that a self-approval shows up as an auditable anomaly. */
  approvedBy?: string;
}
|
|
72
|
+
|
|
73
|
+
/** One row of the manifest's audit trail. */
export interface MergeCompatHistoryEvent {
  /** Position of the event in the history at the time it was appended. */
  seq: number;

  /** Which mutation was performed. */
  action: "propose" | "approve" | "retire";

  /** Id of the compat entry the event concerns. */
  id: string;

  /** Free-text description recorded with the event. */
  detail: string;
}
|
|
79
|
+
|
|
80
|
+
/** On-disk shape of the merge-compat manifest. */
export interface MergeCompatManifest {
  schemaVersion: 1;

  /** The currently live compat entries. */
  entries: Array<MergeCompatEntry>;

  /**
   * Append-only audit of propose/approve/retire. This trail is what makes a
   * self-approval a visible anomaly rather than a silent one.
   */
  history: Array<MergeCompatHistoryEvent>;
}
|
|
86
|
+
|
|
87
|
+
/** A caller-supplied report of how one sealed suite fared on the assembled crate. */
export interface SealedSuiteResult {
  /** Slice id of the suite. */
  slice: string;

  /** Whether the suite passed. */
  passed: boolean;

  /** Failure/panic text (the signature); expected when `passed === false`. */
  failure?: string;
}
|
|
94
|
+
|
|
95
|
+
/** Outcome of adjudicating reported suite results against the compat manifest. */
export interface MergeVerdict {
  ok: boolean;

  /** Failures covered by an entry that matched, is approved, and whose replacement passed. */
  sanctioned: Array<{ slice: string; entryId: string; supersededBy: string }>;

  /** Entry matched and replacement passed, but the entry is NOT approved; blocks until approved. */
  pendingApproval: Array<{ slice: string; entryId: string }>;

  /** Entry matched but its replacement invariant did NOT pass; blocks even when approved. */
  invalid: Array<{ slice: string; entryId: string; reason: string }>;

  /** No entry matched the failure signature; blocks, and a real merge defect is suspected. */
  unsanctioned: Array<{ slice: string; reason: string }>;

  /** Ids of approved entries that sanctioned nothing this run (informational retire candidates). */
  unused: Array<string>;
}
|
|
108
|
+
|
|
109
|
+
// ── paths + persistence ──────────────────────────────────────────────────────
|
|
110
|
+
|
|
111
|
+
const COMPAT_REL = join("90-audit", "merge-compat.json");
|
|
112
|
+
|
|
113
|
+
/**
 * Location of the merge-compat manifest for the project at `root`, obtained by
 * handing the manifest's relative path (`COMPAT_REL`) to `stzPath`.
 */
export function mergeCompatPath(root: string): string {
  const manifestPath = stzPath(root, COMPAT_REL);
  return manifestPath;
}
|
|
116
|
+
|
|
117
|
+
/** Build a new, empty schema-version-1 manifest: no entries and no history. */
export function freshCompatManifest(): MergeCompatManifest {
  const blank: MergeCompatManifest = {
    schemaVersion: 1,
    entries: [],
    history: [],
  };
  return blank;
}
|
|
120
|
+
|
|
121
|
+
/**
 * Load the merge-compat manifest for `root`.
 *
 * @returns The parsed manifest, or a fresh empty one when no file exists yet.
 * @throws SyntaxError when the file is not valid JSON (from `JSON.parse`).
 * @throws Error when the JSON is not an object carrying `entries` and `history`
 *   arrays. A malformed audit file is reported here, with its path, instead of
 *   surfacing later as an unrelated crash.
 */
export function loadCompat(root: string): MergeCompatManifest {
  const p = mergeCompatPath(root);
  if (!existsSync(p)) return freshCompatManifest();

  // Parse as `unknown` and check the top-level shape before trusting the cast.
  const parsed: unknown = JSON.parse(readFileSync(p, "utf8"));
  if (
    typeof parsed !== "object" ||
    parsed === null ||
    !Array.isArray((parsed as { entries?: unknown }).entries) ||
    !Array.isArray((parsed as { history?: unknown }).history)
  ) {
    throw new Error(`malformed merge-compat manifest at ${p}: expected an object with "entries" and "history" arrays`);
  }
  return parsed as MergeCompatManifest;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Write `manifest` to the merge-compat path for `root` as 2-space-indented
 * JSON with a trailing newline, creating the parent directory if needed.
 */
export function saveCompat(root: string, manifest: MergeCompatManifest): void {
  const target = mergeCompatPath(root);
  const parentDir = join(target, "..");
  mkdirSync(parentDir, { recursive: true });
  const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
  writeFileSync(target, serialized, "utf8");
}
|
|
132
|
+
|
|
133
|
+
/**
 * Append one audit event to the manifest's history. `seq` is the history
 * length before the append, so events number 0, 1, 2, … in insertion order.
 */
function appendHistory(m: MergeCompatManifest, action: MergeCompatHistoryEvent["action"], id: string, detail: string): void {
  const event: MergeCompatHistoryEvent = {
    seq: m.history.length,
    action,
    id,
    detail,
  };
  m.history.push(event);
}
|
|
136
|
+
|
|
137
|
+
// ── mutations (propose / approve / retire) ───────────────────────────────────
|
|
138
|
+
|
|
139
|
+
export type ProposeInput = Omit<MergeCompatEntry, "approved" | "approvedBy">;
|
|
140
|
+
|
|
141
|
+
/**
 * Propose a compat entry. The stored entry always lands `approved: false` and
 * without an `approvedBy`, whatever the runtime input carried — the merge agent
 * cannot self-approve (rule 3).
 *
 * Rejected, in this order:
 *  - a missing `id`;
 *  - an empty/blank `panicSubstring` (it would match every failure);
 *  - an empty/blank `pendingAmendment`;
 *  - a duplicate `id`;
 *  - a missing `supersededSlice` (the entry could never match a failure);
 *  - a missing `replacement.slice` (rule 2 could never be checked).
 *
 * On success the entry is appended to `m.entries` and a "propose" event is
 * appended to the history.
 *
 * @returns `{ ok: true }`, or `{ ok: false, error }` with `m` left untouched.
 */
export function proposeCompat(m: MergeCompatManifest, entry: ProposeInput): { ok: true } | { ok: false; error: string } {
  if (!entry.id) return { ok: false, error: "entry id is required" };
  if (!entry.panicSubstring || entry.panicSubstring.trim() === "") {
    return { ok: false, error: "panicSubstring must be non-empty (an empty signature would match every failure)" };
  }
  if (!entry.pendingAmendment || entry.pendingAmendment.trim() === "") {
    return { ok: false, error: "pendingAmendment is required — a compat entry is transitional debt and must name the amendment that retires it" };
  }
  if (m.entries.some((e) => e.id === entry.id)) return { ok: false, error: `duplicate entry id: ${entry.id}` };
  // Checked last so the earlier failure cases keep their existing precedence.
  if (!entry.supersededSlice) {
    return { ok: false, error: "supersededSlice is required — an entry with no superseded suite can never match a failure" };
  }
  if (!entry.replacement?.slice) {
    return { ok: false, error: "replacement.slice is required — supersession cannot be proven without a replacement suite" };
  }

  // The input may be parsed JSON rather than a type-checked literal, so a
  // smuggled `approvedBy` is dropped explicitly; `approved` is overridden.
  const stored: MergeCompatEntry = { ...entry, approved: false };
  delete stored.approvedBy;
  m.entries.push(stored);
  appendHistory(m, "propose", entry.id, `${entry.supersededSlice} superseded by ${entry.supersededBy}; sig="${entry.panicSubstring}"`);
  return { ok: true };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Approve a proposed entry and record who approved it, so that a self-approval
 * is auditable.
 *
 * Rejected, in this order: an unknown `id`, an entry that is already approved,
 * and a blank `by` (an anonymous approval would leave nothing to audit).
 *
 * On success the entry is flipped to `approved: true`, `approvedBy` is set to
 * `by`, and an "approve" event is appended to the history.
 *
 * @returns `{ ok: true }`, or `{ ok: false, error }` with `m` left untouched.
 */
export function approveCompat(m: MergeCompatManifest, id: string, by: string): { ok: true } | { ok: false; error: string } {
  const e = m.entries.find((x) => x.id === id);
  if (!e) return { ok: false, error: `no such compat entry: ${id}` };
  if (e.approved) return { ok: false, error: `already approved: ${id}` };
  if (!by || by.trim() === "") {
    return { ok: false, error: "approver is required — an anonymous approval cannot be audited" };
  }
  e.approved = true;
  e.approvedBy = by;
  appendHistory(m, "approve", id, `approved by ${by}`);
  return { ok: true };
}
|
|
170
|
+
|
|
171
|
+
/**
 * Retire an entry — the end state of this transitional debt. Retirement should
 * correspond to a `seal-amend` of the superseded suite (now wave-aware); the
 * amendment reference is written into the history so the audit links the two.
 *
 * Rejected, in this order: an unknown `id`, and a blank `amendmentRef` (the
 * history event would then link to nothing).
 *
 * On success the entry is removed from `m.entries` and a "retire" event is
 * appended to the history.
 *
 * @returns `{ ok: true }`, or `{ ok: false, error }` with `m` left untouched.
 */
export function retireCompat(m: MergeCompatManifest, id: string, amendmentRef: string): { ok: true } | { ok: false; error: string } {
  const idx = m.entries.findIndex((x) => x.id === id);
  if (idx < 0) return { ok: false, error: `no such compat entry: ${id}` };
  if (!amendmentRef || amendmentRef.trim() === "") {
    return { ok: false, error: "amendmentRef is required — a retirement must name the amendment that replaced the superseded suite" };
  }
  m.entries.splice(idx, 1);
  appendHistory(m, "retire", id, `retired; superseded suite amended via ${amendmentRef}`);
  return { ok: true };
}
|
|
183
|
+
|
|
184
|
+
// ── the deterministic verdict (the heart) ────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
/**
 * Adjudicate reported sealed-suite results against the compat manifest. Pure:
 * same inputs → same verdict (N6); neither argument is mutated.
 *
 * For every result that is not a pass, the entries whose `supersededSlice` and
 * `panicSubstring` match the failure are collected, then classified:
 *  - no match                                 → `unsanctioned`
 *  - a match that is approved AND whose replacement suite passes → `sanctioned`
 *  - else a match that is unapproved but whose replacement passes → `pendingApproval`
 *  - else (replacement failed or was never reported)              → `invalid`
 *
 * Any non-empty `pendingApproval` / `invalid` / `unsanctioned` blocks the merge.
 *
 * The reported data is read fail-closed:
 *  - only the literal boolean `true` counts as a pass, so a malformed report
 *    such as `passed: "false"` is adjudicated as a failure, not skipped;
 *  - a slice counts as passing only if it was reported and NO reported result
 *    for it failed, so a duplicate passing row cannot mask a failing one.
 *
 * @param results - Caller-reported per-suite results on the assembled crate.
 * @param manifest - The audited compat entries.
 */
export function validateMerge(results: SealedSuiteResult[], manifest: MergeCompatManifest): MergeVerdict {
  const reported = new Set<string>();
  const failedSlices = new Set<string>();
  for (const r of results) {
    reported.add(r.slice);
    if (r.passed !== true) failedSlices.add(r.slice);
  }
  const passing = (slice: string): boolean => reported.has(slice) && !failedSlices.has(slice);

  const sanctioned: MergeVerdict["sanctioned"] = [];
  const pendingApproval: MergeVerdict["pendingApproval"] = [];
  const invalid: MergeVerdict["invalid"] = [];
  const unsanctioned: MergeVerdict["unsanctioned"] = [];
  const used = new Set<string>();

  for (const r of results) {
    if (r.passed === true) continue;
    const failure = r.failure ?? "";
    // Signature-pinned: the entry's panicSubstring must appear in the actual text.
    const matches = manifest.entries.filter(
      (e) => e.supersededSlice === r.slice && e.panicSubstring.length > 0 && failure.includes(e.panicSubstring),
    );
    const firstMatch = matches[0];
    if (firstMatch === undefined) {
      unsanctioned.push({ slice: r.slice, reason: "no compat entry matches the failure signature — suspect a real merge defect" });
      continue;
    }
    // Best outcome first: approved + replacement-proven → sanctioned.
    const sanction = matches.find((e) => e.approved && passing(e.replacement.slice));
    if (sanction) {
      sanctioned.push({ slice: r.slice, entryId: sanction.id, supersededBy: sanction.supersededBy });
      used.add(sanction.id);
      continue;
    }
    // Replacement proven but not yet approved → pending (rule 3).
    const pend = matches.find((e) => !e.approved && passing(e.replacement.slice));
    if (pend) {
      pendingApproval.push({ slice: r.slice, entryId: pend.id });
      used.add(pend.id);
      continue;
    }
    // Matched but the replacement invariant is not proven → invalid, blocks even
    // if approved (rule 2: no supersession claim without a proven replacement).
    // "Ran and failed" is reported separately from "never reported": claiming a
    // suite "did not pass" when it simply was not run would be misleading.
    const reason = reported.has(firstMatch.replacement.slice)
      ? `replacement invariant ${firstMatch.replacement.slice} did not pass — supersession unproven`
      : `replacement suite ${firstMatch.replacement.slice} was not in the reported results — cannot prove supersession; run and report it`;
    invalid.push({ slice: r.slice, entryId: firstMatch.id, reason });
    used.add(firstMatch.id);
  }

  const unused = manifest.entries.filter((e) => e.approved && !used.has(e.id)).map((e) => e.id);
  const ok = unsanctioned.length === 0 && invalid.length === 0 && pendingApproval.length === 0;
  return { ok, sanctioned, pendingApproval, invalid, unsanctioned, unused };
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Mock testing harness
|
|
2
|
+
|
|
3
|
+
This folder holds the **deterministic mock run** — a self-contained, no-network
|
|
4
|
+
demo of the STZ pipeline. It is a testing and illustration aid, not part of the
|
|
5
|
+
production path. The production harness runs in-session through the `/stz:*`
|
|
6
|
+
commands and the `stz bridge` CLI (`src/bridge.ts`, `src/project.ts`); none of
|
|
7
|
+
those depend on anything in this folder.
|
|
8
|
+
|
|
9
|
+
## What is here
|
|
10
|
+
|
|
11
|
+
- `orchestrator.ts` — a single-process pipeline that drives one slice through all
|
|
12
|
+
eight phases against a fake model layer. Used only by `stz run` and by
|
|
13
|
+
`test/orchestrator.test.ts`.
|
|
14
|
+
- `interfaces.ts` — the model-layer seam (`Specimen`, `Judge`, `TestAuthor`,
|
|
15
|
+
`Documenter`, `ModelLayer`). A live TypeScript model implementation would
|
|
16
|
+
implement these; today only the mock does.
|
|
17
|
+
- `mock.ts` — the deterministic `MockModelLayer`. Specimen quality is configured,
|
|
18
|
+
not sampled, so every run is reproducible (N6). The mock eval runner still runs
|
|
19
|
+
the **real** hack-detector, so the anti-reward-hacking layer is exercised for
|
|
20
|
+
real even though the model is fake.
|
|
21
|
+
|
|
22
|
+
## Run it
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
stz run <dir> # or: node bin/stz.mjs run <dir>
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
This drives the demo slice end to end: four specimens compete, a test-skipping
|
|
29
|
+
specimen is disqualified by the hack-detector, a GRPO-weighted winner is chosen,
|
|
30
|
+
and the full `.stz/` audit tree is written under `<dir>`. No API keys, no
|
|
31
|
+
network, no subagents.
|
|
32
|
+
|
|
33
|
+
## Why it exists
|
|
34
|
+
|
|
35
|
+
The mock proves the deterministic spine (taxonomy, state, selection, GRPO,
|
|
36
|
+
escalation, budget, pressure log, spec-diff, audit) end to end without spending
|
|
37
|
+
tokens. The same spine powers the real in-session tournament; the only thing the
|
|
38
|
+
mock replaces is the model layer. It is covered by `test/orchestrator.test.ts`
|
|
39
|
+
(success path, bounded-escalation failure path, budget kill-switch, determinism,
|
|
40
|
+
anti-hacking integration).
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The LLM / subagent boundary (§3 Generative AI Layer).
|
|
3
|
+
*
|
|
4
|
+
* These interfaces are the seam between the deterministic STZ spine and the
|
|
5
|
+
* non-deterministic model layer. A live implementation invokes Claude Code /
|
|
6
|
+
* Codex subagents; the deterministic mock (./mock.ts) lets the whole pipeline
|
|
7
|
+
* run end-to-end in tests without any network call. Keeping the seam this thin
|
|
8
|
+
* is what makes "drop in a live impl" real rather than aspirational.
|
|
9
|
+
*/
|
|
10
|
+
import type {
|
|
11
|
+
DonePredicate,
|
|
12
|
+
EvalResult,
|
|
13
|
+
PairwiseVote,
|
|
14
|
+
SliceManifest,
|
|
15
|
+
SpecimenId,
|
|
16
|
+
} from "../types.js";
|
|
17
|
+
import type { Spec } from "../specdiff.js";
|
|
18
|
+
|
|
19
|
+
/** The artefact a specimen hands back after implementing a slice (F6). */
export interface SpecimenOutput {
  /** Id of the specimen that produced this output. */
  specimen: SpecimenId;

  /** The implementation diff, materialized as relative path → file contents. */
  files: Record<string, string>;

  /** Strategy label handed out by the diversification subagent (R5). */
  strategy: string;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Elicitation subagent (F2): turns a request into a questionnaire, the done
 * predicates, and a complexity figure.
 */
export interface Elicitor {
  elicit(request: string): Promise<{
    questionnaire: Record<string, string>;
    donePredicates: Array<DonePredicate>;
    complexity: number;
  }>;
}
|
|
36
|
+
|
|
37
|
+
/** Frozen test-author subagent (F10/L1): writes the sealed, held-out suite for a slice. */
export interface TestAuthor {
  authorTests(manifest: SliceManifest): Promise<{
    /** Sealed tests as relative path → contents (judge-only, read-only). */
    sealed: Record<string, string>;
    /** The rubric produced alongside the sealed suite. */
    rubric: string;
  }>;
}
|
|
45
|
+
|
|
46
|
+
/** Strategy-diversification subagent (R5): proposes `n` distinct implementation strategies. */
export interface Strategist {
  strategies(manifest: SliceManifest, n: number): Promise<Array<string>>;
}
|
|
50
|
+
|
|
51
|
+
/** An implementer specimen (F6): builds one slice following one strategy. */
export interface Specimen {
  implement(
    manifest: SliceManifest,
    strategy: string,
    /** Pressure-log refinement context supplied on a retry round (F9/F14); otherwise `null`. */
    refinement: string | null,
  ): Promise<SpecimenOutput>;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Eval runner (§3 Compute): evaluates a specimen's output against the sealed
 * suite, plus coverage and mutation (F7/F11).
 */
export interface EvalRunner {
  evaluate(
    output: SpecimenOutput,
    sealed: Record<string, string>,
  ): Promise<EvalResult>;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Judge subagent (F7 stage 2): casts a single pairwise vote from a frozen,
 * separate context and resolves to the id of the preferred specimen.
 */
export interface Judge {
  vote(
    a: SpecimenOutput,
    b: SpecimenOutput,
    sealed: Record<string, string>,
  ): Promise<SpecimenId>;
}
|
|
77
|
+
|
|
78
|
+
/** Documenter subagent (F13): derives the as-built spec from the winning merged code. */
export interface Documenter {
  asBuilt(winner: SpecimenOutput): Promise<Spec>;
}
|
|
82
|
+
|
|
83
|
+
/** Planner subagent (F5): derives the intent spec from a slice manifest. */
export interface Planner {
  intentSpec(manifest: SliceManifest): Promise<Spec>;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Every model-layer collaborator the orchestrator needs, bundled so a single
 * implementation (mock or live) can be swapped in as a unit.
 */
export interface ModelLayer {
  /** Questionnaire → done predicates + complexity (F2). */
  elicitor: Elicitor;
  /** Authors the sealed held-out suite (F10/L1). */
  testAuthor: TestAuthor;
  /** Proposes distinct implementation strategies (R5). */
  strategist: Strategist;
  /** Implements a slice (F6). */
  specimen: Specimen;
  /** Runs the sealed suite, coverage, and mutation (F7/F11). */
  evalRunner: EvalRunner;
  /** Casts pairwise votes (F7 stage 2). */
  judge: Judge;
  /** Writes the as-built spec (F13). */
  documenter: Documenter;
  /** Writes the intent spec (F5). */
  planner: Planner;
}
|
|
99
|
+
|
|
100
|
+
/**
 * Convenience: collect `votes` judge votes for one pair into list form (F7).
 *
 * Votes are requested one at a time, each awaited before the next is asked
 * for. Every vote is recorded as `{ a, b, winner }` using the two specimens'
 * ids. A `votes` of zero or less yields an empty list.
 */
export async function votePair(
  judge: Judge,
  a: SpecimenOutput,
  b: SpecimenOutput,
  sealed: Record<string, string>,
  votes: number,
): Promise<PairwiseVote[]> {
  const tally: PairwiseVote[] = [];
  let cast = 0;
  while (cast < votes) {
    const preferred = await judge.vote(a, b, sealed);
    tally.push({ a: a.specimen, b: b.specimen, winner: preferred });
    cast += 1;
  }
  return tally;
}
|