slice-tournament-zoo 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ /**
2
+ * Deterministic mock model layer.
3
+ *
4
+ * Lets the full slice pipeline run end-to-end with zero network calls and
5
+ * perfectly reproducible outputs (N6). Specimen "quality" is configured, not
6
+ * sampled, so tests can drive both the success path and the failure path
7
+ * (no passers → retry → replan → halt). The mock EvalRunner runs the *real*
8
+ * hack-detector over the specimen files, so the anti-hacking layer is exercised
9
+ * for real even though the model is fake.
10
+ */
11
+ import type {
12
+ DonePredicate,
13
+ EvalResult,
14
+ HackPattern,
15
+ SliceManifest,
16
+ SpecimenId,
17
+ } from "../types.js";
18
+ import type {
19
+ Documenter,
20
+ Elicitor,
21
+ EvalRunner,
22
+ Judge,
23
+ ModelLayer,
24
+ Planner,
25
+ Specimen,
26
+ SpecimenOutput,
27
+ Strategist,
28
+ TestAuthor,
29
+ } from "./interfaces.js";
30
+ import type { Spec } from "../specdiff.js";
31
+ import { detectHacks } from "../hack-detector.js";
32
+ import { evalReward } from "../selection.js";
33
+
34
/**
 * Quality profile for one mock specimen in one round.
 *
 * Every value is configured rather than sampled, so a run driven by these
 * profiles is reproducible.
 */
export interface MockProfile {
  /** Identifier of the specimen this profile describes. */
  specimen: SpecimenId;
  /** Strategy name; the mock specimen looks its profile up by this value. */
  strategy: string;
  /** Configured gate outcome (the mock eval additionally requires zero hack findings). */
  passGate: boolean;
  /** Configured test pass rate reported by the mock eval. */
  testPassRate: number;
  /** Configured coverage reported by the mock eval. */
  coverage: number;
  /** Configured mutation score reported by the mock eval. */
  mutationScore: number;
  /** If set, inject a file containing this hack pattern (→ disqualification). */
  hack?: HackPattern;
}
45
+
46
/** Full configuration for the deterministic mock model layer. */
export interface MockConfig {
  /** Complexity value the mock elicitor reports. */
  complexity: number;
  /** Done-predicates the mock elicitor reports. */
  donePredicates: Array<DonePredicate>;
  /** Profiles per round; index = round (0 = initial, 1 = retry, …). Last reused. */
  profilesByRound: Array<Array<MockProfile>>;
  /** Judge votes per pair. NOTE(review): not read by the mock layer itself — confirm the consumer. */
  votesPerPair: number;
}
53
+
54
/**
 * One source snippet per hack pattern. The mock specimen writes the snippet
 * for its configured pattern into an extra `*.hack.ts` file so the hack
 * detector has something to find.
 */
const HACK_SNIPPET: Record<HackPattern, string> = {
  "test-skip": `it.skip('contract', () => {});`,
  "assertion-mutation": `expect(true).toBe(true);`,
  "network-bypass": `const r = await fetch('https://api.example.com/score');`,
  "fixture-keyed-branch": `if (input === FIXTURE_GOLDEN) return cached;`,
  "hardcoded-test-input": `return 'EXPECTED_OUTPUT';`,
};
61
+
62
/**
 * Builds the default mock configuration: a single round of four specimens in
 * which a, b and c are configured to pass the gate and d is configured to fail
 * it while also carrying a `test-skip` hack.
 *
 * @returns A freshly allocated config on every call.
 */
export function defaultMockConfig(): MockConfig {
  const donePredicates: DonePredicate[] = [
    { id: "schema", expr: "returns_schema(Result)", kind: "schema" },
    { id: "latency", expr: "p95_latency_ms < 200", kind: "metric" },
  ];

  const initialRound: MockProfile[] = [
    {
      specimen: "a",
      strategy: "iterator-based",
      passGate: true,
      testPassRate: 1.0,
      coverage: 0.95,
      mutationScore: 0.1,
    },
    {
      specimen: "b",
      strategy: "stream-based",
      passGate: true,
      testPassRate: 1.0,
      coverage: 0.88,
      mutationScore: 0.2,
    },
    {
      specimen: "c",
      strategy: "batch-based",
      passGate: true,
      testPassRate: 0.9,
      coverage: 0.8,
      mutationScore: 0.3,
    },
    {
      specimen: "d",
      strategy: "recursive",
      passGate: false,
      testPassRate: 0.4,
      coverage: 0.5,
      mutationScore: 0.6,
      hack: "test-skip",
    },
  ];

  return {
    complexity: 2,
    donePredicates,
    profilesByRound: [initialRound],
    votesPerPair: 8,
  };
}
80
+
81
/**
 * A config whose every specimen always fails — drives retry → replan → halt.
 *
 * All four specimens share the same failing metrics and differ only in id and
 * strategy name. Only one round is configured, so it is reused for every
 * later round.
 */
export function alwaysFailConfig(): MockConfig {
  const roster: Array<[SpecimenId, string]> = [
    ["a", "iterator-based"],
    ["b", "stream-based"],
    ["c", "batch-based"],
    ["d", "recursive"],
  ];

  const failingRound = roster.map(
    ([specimen, strategy]): MockProfile => ({
      specimen,
      strategy,
      passGate: false,
      testPassRate: 0.3,
      coverage: 0.4,
      mutationScore: 0.7,
    }),
  );

  return {
    complexity: 5,
    donePredicates: [{ id: "x", expr: "ok == true", kind: "test" }],
    profilesByRound: [failingRound],
    votesPerPair: 8,
  };
}
100
+
101
/**
 * Deterministic, network-free implementation of `ModelLayer`.
 *
 * Every role answers from the configured `MockConfig`; nothing is sampled.
 * The profiles in effect are those of the current round, where rounds past the
 * last configured one reuse the last entry of `profilesByRound`.
 */
export class MockModelLayer implements ModelLayer {
  /** Zero-based escalation round; advanced by {@link nextRound}. */
  private round = 0;

  /**
   * @param cfg Mock configuration; defaults to {@link defaultMockConfig}.
   */
  constructor(private readonly cfg: MockConfig = defaultMockConfig()) {}

  /** Advance to the next escalation round (orchestrator calls on retry/replan). */
  nextRound(): void {
    this.round++;
  }

  /**
   * Profiles for the current round, clamped to the last configured round.
   *
   * @throws Error when `profilesByRound` has no rounds at all. Without this
   *   guard the clamp would index -1 and callers would crash on `undefined`.
   */
  private currentProfiles(): MockProfile[] {
    const rounds = this.cfg.profilesByRound;
    const profiles = rounds[Math.min(this.round, rounds.length - 1)];
    if (profiles === undefined) {
      throw new Error(
        "MockModelLayer: profilesByRound is empty; configure at least one round of profiles",
      );
    }
    return profiles;
  }

  /**
   * Looks up the current-round profile of a specimen.
   *
   * @throws Error when the specimen has no profile in the current round, so a
   *   mis-configured test fails with a message instead of an opaque TypeError.
   */
  private profileFor(specimen: SpecimenId): MockProfile {
    const profile = this.currentProfiles().find((x) => x.specimen === specimen);
    if (profile === undefined) {
      throw new Error(
        `MockModelLayer: no profile configured for specimen "${specimen}" in the current round`,
      );
    }
    return profile;
  }

  /** Echoes the request into a fixed questionnaire plus the configured predicates and complexity. */
  elicitor: Elicitor = {
    elicit: async (request: string) => ({
      questionnaire: { request, scope: "single-repo", interface: "documented" },
      donePredicates: this.cfg.donePredicates,
      complexity: this.cfg.complexity,
    }),
  };

  /** Produces one placeholder sealed suite file and a fixed rubric string. */
  testAuthor: TestAuthor = {
    authorTests: async (manifest: SliceManifest) => ({
      sealed: {
        "held-out/contract.spec.ts": `// sealed held-out suite for ${manifest.id}\n// property-based generators run at judge time (R8)\n`,
      },
      rubric: "trace quality, convention adherence, coverage delta, as-built clarity",
    }),
  };

  /**
   * Returns the strategy names of the first `n` current-round profiles, padded
   * with synthetic `strategy-<index>` names when fewer than `n` are configured.
   */
  strategist: Strategist = {
    strategies: async (_m: SliceManifest, n: number) => {
      const out = this.currentProfiles()
        .slice(0, n)
        .map((p) => p.strategy);
      while (out.length < n) out.push(`strategy-${out.length}`);
      return out;
    },
  };

  /**
   * Emits a one-file implementation for the profile matching `strategy`,
   * falling back to the first profile of the round when no strategy matches
   * (e.g. for padded synthetic strategy names). A configured hack adds a
   * second file containing the corresponding snippet.
   */
  specimen: Specimen = {
    implement: async (manifest, strategy, _refinement) => {
      const profiles = this.currentProfiles();
      const p = profiles.find((x) => x.strategy === strategy) ?? profiles[0];
      if (p === undefined) {
        throw new Error(
          `MockModelLayer: current round has no profiles; cannot implement strategy "${strategy}"`,
        );
      }
      const files: Record<string, string> = {
        [`src/${manifest.name}.ts`]:
          `// ${manifest.name} via ${strategy}\nexport function run(input: unknown) {\n return process(input);\n}\n`,
      };
      if (p.hack) {
        files[`src/${manifest.name}.hack.ts`] = `${HACK_SNIPPET[p.hack]}\n`;
      }
      return { specimen: p.specimen, files, strategy } satisfies SpecimenOutput;
    },
  };

  /**
   * Reports the configured metrics for the specimen, but runs the real hack
   * detector over its files: the gate passes only when the profile says so
   * and the detector returns no findings.
   */
  evalRunner: EvalRunner = {
    evaluate: async (output: SpecimenOutput, _sealed) => {
      const p = this.profileFor(output.specimen);
      const hackFindings = detectHacks(output.specimen, output.files, {
        fixtureNames: ["FIXTURE_GOLDEN"],
      });
      return {
        specimen: output.specimen,
        passedGate: p.passGate && hackFindings.length === 0,
        testPassRate: p.testPassRate,
        coverage: p.coverage,
        mutationScore: p.mutationScore,
        hackFindings,
      } satisfies EvalResult;
    },
  };

  judge: Judge = {
    // Deterministic, majority-correct judge: the higher-reward specimen wins.
    // Reward is recomputed from the configured profile so votes are stable (N6).
    vote: async (a, b, _sealed): Promise<SpecimenId> => {
      const ra = this.rewardFor(a.specimen);
      const rb = this.rewardFor(b.specimen);
      if (ra > rb) return a.specimen;
      if (rb > ra) return b.specimen;
      // Equal reward: break the tie by the smaller specimen id so the vote stays stable.
      return a.specimen < b.specimen ? a.specimen : b.specimen;
    },
  };

  /** Returns the three baseline claims plus one naming the winner's strategy. */
  documenter: Documenter = {
    asBuilt: async (winner: SpecimenOutput): Promise<Spec> => ({
      claims: [
        `implements the contract`,
        `passes sealed held-out suite`,
        `exposes run() entrypoint`,
        // Extra: the specific strategy the winner chose (plan left "how" open,
        // R5) — surfaces in the spec-diff as "built beyond plan", not "missing".
        `via ${winner.strategy} strategy`,
      ],
    }),
  };

  planner: Planner = {
    // Intent spec states behavioural claims about the slice. Done-predicates
    // live in the manifest/questionnaire and are checked by the sealed suite,
    // so they are not restated as spec claims here — that keeps the intent vs
    // as-built diff a comparison of delivered behaviour (F13).
    intentSpec: async (_manifest: SliceManifest): Promise<Spec> => ({
      claims: [
        `implements the contract`,
        `passes sealed held-out suite`,
        `exposes run() entrypoint`,
      ],
    }),
  };

  /**
   * Reward of a specimen computed from its configured profile only: hack
   * findings are passed as empty, so detected hacks do not affect the value.
   *
   * @throws Error when the specimen has no profile in the current round.
   */
  private rewardFor(s: SpecimenId): number {
    const p = this.profileFor(s);
    return evalReward({
      specimen: s,
      passedGate: p.passGate,
      testPassRate: p.testPassRate,
      coverage: p.coverage,
      mutationScore: p.mutationScore,
      hackFindings: [],
    });
  }
}
@@ -0,0 +1,457 @@
1
+ /**
2
+ * The per-slice orchestrator (§3 Pipeline Topology). Sequences the 8 phases
3
+ * (F1), checkpoints `state.json` at every phase boundary (F16), tracks cost
4
+ * through the ledger middleware (N5/N6), runs the adversarial tournament (F6),
5
+ * applies hybrid selection (F7) + GRPO (F8), persists the pressure log (F9),
6
+ * drives bounded escalation (F14), and materializes the audit artifacts (F13).
7
+ *
8
+ * The model layer is injected (ModelLayer), so this runs identically against
9
+ * the deterministic mock and a future live Claude Code / Codex implementation.
10
+ *
11
+ * STUBBED vs the full design (logged via the `log` sink, surfaced in AS-BUILT):
12
+ * - git worktrees per specimen → prototypes/specimen-X/ directories instead.
13
+ * - per-worktree ephemeral observability stacks → not spun up.
14
+ * - live Python eval drivers / mutation / PBT → mock EvalRunner.
15
+ * - local embeddings / cross-slice RAG → not built.
16
+ */
17
+ import { join } from "node:path";
18
+ import { mkdir, writeFile } from "node:fs/promises";
19
+ import type {
20
+ Judgment,
21
+ Phase,
22
+ SliceManifest,
23
+ SliceState,
24
+ SpecimenId,
25
+ } from "../types.js";
26
+ import { PHASES } from "../types.js";
27
+ import { CostTracker } from "../cost-tracker.js";
28
+ import {
29
+ freshState,
30
+ saveState,
31
+ setPhaseStatus,
32
+ appendEvent,
33
+ } from "../state.js";
34
+ import { allocateBudget, wouldExceed } from "../budget.js";
35
+ import { scaffold, writeDoc, stzPath } from "../taxonomy.js";
36
+ import { select, pairings, evalReward } from "../selection.js";
37
+ import { votePair, type ModelLayer, type SpecimenOutput } from "./interfaces.js";
38
+ import {
39
+ onNoPassers,
40
+ initialEscalation,
41
+ type EscalationState,
42
+ } from "../escalation.js";
43
+ import {
44
+ renderPressureLog,
45
+ refinementContext,
46
+ type CulledSpecimen,
47
+ type PressureLog,
48
+ } from "../pressure.js";
49
+ import { diffSpecs, renderSpecDiff, isFaithful, type Spec } from "../specdiff.js";
50
+
51
/** Inputs to {@link runSlice}. */
export interface OrchestratorOptions {
  /** Root directory handed to the scaffold, document and state writers. */
  root: string;
  /** Manifest of the slice to run. */
  manifest: SliceManifest;
  /** Injected model layer; `nextRound`, when present, is called before each retry/replan round. */
  model: ModelLayer & { nextRound?: () => void };
  /** Specimen count N (F6, default 4). */
  n?: number;
  /** Remaining token pool used for budget allocation (default 5,000,000). */
  poolRemaining?: number;
  /** Progress sink (defaults to no-op; CLI passes console.log). */
  log?: (msg: string) => void;
}
61
+
62
/** Outcome of one {@link runSlice} invocation. */
export interface SliceResult {
  /** Id of the slice that was run (the manifest id). */
  sliceId: string;
  /** Final slice state as last checkpointed. */
  state: SliceState;
  /** Selection judgment, or `null` when the run halted without a winner. */
  judgment: Judgment | null;
  /** Winning specimen, or `null` when the run halted. */
  winner: SpecimenId | null;
  /** `true` when escalation ended in a halt (no specimen ever passed the gate). */
  halted: boolean;
  /** Whether the as-built spec was judged faithful to the intent spec; `false` on halt. */
  faithful: boolean;
  /** Relative paths of materialized audit artifacts under .stz/. */
  artifacts: string[];
  /** Number of tournament rounds executed. */
  rounds: number;
}
73
+
74
/**
 * Synthetic per-call token charge so the ledger/budget are exercised (N5).
 * Every metered call is billed exactly `prompt + completion` tokens.
 */
const TOKENS_PER_CALL = {
  prompt: 1_200,
  completion: 800,
};
76
+
77
/**
 * Raised when a slice would breach its hard token cap (N5/R3 kill-switch).
 *
 * Instances carry `name === "BudgetExceededError"` so they can be told apart
 * from generic errors by name as well as by `instanceof`.
 */
export class BudgetExceededError extends Error {
  override name = "BudgetExceededError";

  /** @param message Human-readable description of the breach. */
  constructor(message: string) {
    super(message);
  }
}
84
+
85
/**
 * Runs one slice through the whole pipeline: elicitation, research (stub),
 * standards (stub), test authoring, planning, then tournament + judgment
 * rounds until either a winner is selected or the escalation FSM halts.
 *
 * State is checkpointed around every phase transition, every model call is
 * metered through `charge`, and audit artifacts are written before returning.
 *
 * @param opts Root directory, manifest, injected model layer and optional knobs.
 * @returns The final state, the judgment/winner (both `null` on halt), the
 *   list of artifact paths (each listed once) and the number of rounds run.
 * @throws BudgetExceededError when a metered call would exceed the token cap.
 * @throws Error when elicitation yields no done-predicates (F2).
 */
export async function runSlice(opts: OrchestratorOptions): Promise<SliceResult> {
  const { root, manifest, model } = opts;
  const n = opts.n ?? 4;
  const log = opts.log ?? (() => {});
  const poolRemaining = opts.poolRemaining ?? 5_000_000;
  const tracker = new CostTracker();
  const sliceDir = join("40-slices", manifest.id);
  const artifacts: string[] = [];

  // Records artifact paths once each: the pressure directory is reported on
  // every round, and repeated rounds must not produce duplicate entries.
  const recordArtifact = (...paths: string[]): void => {
    for (const p of paths) {
      if (!artifacts.includes(p)) artifacts.push(p);
    }
  };

  await scaffold(root);

  // Elicitation drives the budget, so allocate after we know complexity.
  let state = freshState(manifest.id, manifest.complexity, poolRemaining);

  const charge = (phase: Phase, role: Parameters<CostTracker["record"]>[0]["role"]) => {
    const cost = TOKENS_PER_CALL.prompt + TOKENS_PER_CALL.completion;
    // N5 hard per-slice token cap / R3 kill-switch: refuse to proceed past the
    // cap rather than silently overrunning. The cap is enforced here, the one
    // place every model call is metered.
    if (wouldExceed(state.budget, cost)) {
      throw new BudgetExceededError(
        `${manifest.id}: token cap ${state.budget.tokenCap} would be exceeded ` +
          `(spent ${state.budget.tokensSpent} + ${cost} in ${phase}/${role}).`,
      );
    }
    const rec = tracker.record({
      id: `${manifest.id}-${role}-${tracker.count()}`,
      phase,
      role,
      model: "mock",
      temperature: 0,
      seed: 0,
      promptTokens: TOKENS_PER_CALL.prompt,
      completionTokens: TOKENS_PER_CALL.completion,
    });
    state.budget.tokensSpent += rec.promptTokens + rec.completionTokens;
    state.callCount = tracker.count();
  };

  const checkpoint = async () => {
    await saveState(root, state);
  };

  // Writes plan.md for the given intent spec. Shared by the planning phase and
  // the replan path so the persisted plan always matches the intent that the
  // spec-diff is computed against.
  const writePlan = async (spec: Spec): Promise<void> => {
    await writeDoc(root, join(sliceDir, "plan.md"), {
      frontmatter: { summary: `Intent spec for ${manifest.id}: ${spec.claims.length} claims.` },
      body: `# Intent spec — ${manifest.id}\n\n${spec.claims.map((c) => `- ${c}`).join("\n")}\n`,
    });
  };

  // ── Phase 1: elicitation (F2) ───────────────────────────────────────────
  state = setPhaseStatus(state, "elicitation", "running");
  await checkpoint();
  const elicited = await model.elicitor.elicit(manifest.contract);
  charge("elicitation", "elicitor");
  // Re-derive budget from the elicited complexity (F15), keeping what was spent.
  state.budget = {
    ...allocateBudget(elicited.complexity, poolRemaining),
    tokensSpent: state.budget.tokensSpent,
  };
  if (elicited.donePredicates.length === 0) {
    throw new Error("F2 violation: elicitation produced no machine-checkable predicates");
  }
  await writeDoc(root, join("00-intent", "questionnaire.md"), {
    frontmatter: {
      summary: `Elicitation for ${manifest.id}: ${elicited.donePredicates.length} done-predicates, complexity ${elicited.complexity}.`,
      complexity: elicited.complexity,
    },
    body:
      `# Elicitation — ${manifest.id}\n\n## Questionnaire\n` +
      Object.entries(elicited.questionnaire).map(([k, v]) => `- **${k}:** ${v}`).join("\n") +
      `\n\n## Done predicates (machine-checkable)\n` +
      elicited.donePredicates.map((d) => `- \`${d.expr}\` (${d.kind})`).join("\n") +
      "\n",
  });
  // Recorded as a plain relative path, like every other artifact, rather than
  // climbing out of the slice directory with "../..".
  recordArtifact("00-intent/questionnaire.md");
  state = setPhaseStatus(state, "elicitation", "done");
  await checkpoint();
  log(`[${manifest.id}] elicitation: ${elicited.donePredicates.length} predicates, complexity ${elicited.complexity}`);

  // ── Phase 2: research (stub, F3 routes documented) ──────────────────────
  state = setPhaseStatus(state, "research", "running");
  await checkpoint();
  charge("research", "researcher");
  await writeDoc(root, join("10-research", "validated.md"), {
    frontmatter: { summary: `Research for ${manifest.id} (stubbed: tiered ground-truth validation routes documented, not executed).` },
    body: "# Validated research\n\n_Stub: live research + 3-route ground-truth validation (F3) deferred to live model layer._\n",
  });
  state = setPhaseStatus(state, "research", "done");
  // Phase 3 (ground-truth validation) is stubbed together with research.
  state = setPhaseStatus(state, "ground-truth-validation", "done");
  await checkpoint();

  // ── Phase 4: standards (stub) ───────────────────────────────────────────
  state = setPhaseStatus(state, "standards", "running");
  await checkpoint();
  await writeDoc(root, join("20-standards", "conventions.md"), {
    frontmatter: { summary: `Conventions v1 for ${manifest.id}.`, version: 1 },
    body: "# Conventions (v1)\n\n- Slices are contract-bounded.\n- No secrets in the markdown tree.\n",
  });
  state = setPhaseStatus(state, "standards", "done");
  await checkpoint();

  // ── Phase 5: test-authoring (F10/L1 frozen, sealed) ─────────────────────
  state = setPhaseStatus(state, "test-authoring", "running");
  await checkpoint();
  const { sealed, rubric } = await model.testAuthor.authorTests(manifest);
  charge("test-authoring", "test-author");
  for (const [path, contents] of Object.entries(sealed)) {
    await writeDoc(root, join("30-tests", path), {
      frontmatter: { summary: `Sealed held-out test ${path} (read-only; judge-loaded only).`, sealed: true },
      body: contents,
    });
  }
  await writeDoc(root, join("30-tests", "rubric.md"), {
    frontmatter: { summary: "Judge ranking rubric." },
    body: `# Rubric\n\n${rubric}\n`,
  });
  recordArtifact("30-tests/held-out");
  state = setPhaseStatus(state, "test-authoring", "done");
  await checkpoint();
  log(`[${manifest.id}] test-author: sealed ${Object.keys(sealed).length} held-out file(s)`);

  // ── Phase 6: planning (F5 intent spec) ──────────────────────────────────
  state = setPhaseStatus(state, "planning", "running");
  await checkpoint();
  let intent = await model.planner.intentSpec(manifest);
  charge("planning", "planner");
  await writePlan(intent);
  recordArtifact(`${sliceDir}/plan.md`);
  state = setPhaseStatus(state, "planning", "done");
  await checkpoint();

  // ── Phases 7+8: tournament + judgment with bounded escalation (F6/F7/F14) ─
  let esc: EscalationState = initialEscalation();
  let refinement: string | null = null;
  let judgment: Judgment | null = null;
  let winner: SpecimenId | null = null;
  let asBuilt: Spec | null = null;
  let rounds = 0;

  // eslint-disable-next-line no-constant-condition
  while (true) {
    rounds++;
    state = setPhaseStatus(state, "tournament", "running");
    state.escalation = esc.stage;
    state.retryCount = esc.retryCount;
    state.replanCount = esc.replanCount;
    await checkpoint();

    // Strategy diversification (R5) → spawn N specimens (F6). Specimens are
    // produced one after another here, which keeps call ordering deterministic.
    const strategies = await model.strategist.strategies(manifest, n);
    charge("tournament", "specimen");
    log(`[${manifest.id}] round ${rounds}: spawning ${n} specimens [${strategies.join(", ")}] (worktrees STUBBED → prototype dirs)`);

    const outputs: SpecimenOutput[] = [];
    for (const strat of strategies) {
      const out = await model.specimen.implement(manifest, strat, refinement);
      charge("tournament", "specimen");
      outputs.push(out);
      // Materialize into prototypes/specimen-X/ (worktree stand-in).
      const protoDir = stzPath(root, join(sliceDir, "prototypes", `specimen-${out.specimen}`));
      await mkdir(protoDir, { recursive: true });
      for (const [path, contents] of Object.entries(out.files)) {
        const full = join(protoDir, path);
        // join(full, "..") is the file's parent directory.
        await mkdir(join(full, ".."), { recursive: true });
        await writeFile(full, contents, "utf8");
      }
    }

    // Eval gate: sealed suite + coverage + mutation + hack-pattern detect (F7/F10).
    const evals = [];
    for (const out of outputs) {
      const r = await model.evalRunner.evaluate(out, sealed);
      charge("tournament", "specimen");
      evals.push(r);
    }
    const passers = evals.filter((e) => e.passedGate).map((e) => e.specimen);
    state = setPhaseStatus(state, "tournament", "done");
    state.activeSpecimens = passers;
    await checkpoint();

    // Judgment (F7 stage 2 + F8).
    state = setPhaseStatus(state, "judgment", "running");
    await checkpoint();
    const outById = new Map(outputs.map((o) => [o.specimen, o]));

    if (passers.length === 0) {
      // No passers → consult the escalation FSM (F14). The orchestrator NEVER
      // loops on its own; the FSM alone decides whether another round is allowed.
      const culled = buildPressure(manifest.id, evals, outputs, []);
      await persistPressure(root, manifest.id, culled, []);
      recordArtifact(`50-pressure/${manifest.id}`);
      const { next, action } = onNoPassers(esc);
      esc = next;
      state.escalation = esc.stage;
      state = appendEvent(state, "judgment", `escalation-${action.type}`, action.note);
      await checkpoint();
      log(`[${manifest.id}] no passers → ${action.type}: ${action.note}`);

      if (action.type === "halt") {
        state.failureReport = structuredFailure(manifest, evals, rounds);
        await writeDoc(root, join(sliceDir, "failure-report.md"), {
          frontmatter: { summary: `Halt: no passers after ${rounds} round(s).` },
          body: state.failureReport,
        });
        recordArtifact(`${sliceDir}/failure-report.md`);
        state = setPhaseStatus(state, "judgment", "failed");
        await checkpoint();
        await writeAudit(root, manifest.id, tracker, state);
        return { sliceId: manifest.id, state, judgment: null, winner: null, halted: true, faithful: false, artifacts, rounds };
      }

      // retry or replan: build refinement context (F9), advance the model round.
      const advantages = select(evals, []).judgment.advantages;
      refinement = refinementContext({ sliceId: manifest.id, culled }, advantages);
      if (action.type === "replan") {
        // Re-enter planning with failure analysis (F14).
        state = setPhaseStatus(state, "planning", "running");
        intent = await model.planner.intentSpec(manifest);
        charge("planning", "planner");
        // Keep the persisted plan in step with the intent used for the spec-diff.
        await writePlan(intent);
        state = setPhaseStatus(state, "planning", "done");
      }
      model.nextRound?.();
      continue;
    }

    // We have passers → pairwise V votes (F7), then select.
    const pairs = pairings(passers);
    const votes = [];
    for (const [pa, pb] of pairs) {
      const oa = outById.get(pa)!;
      const ob = outById.get(pb)!;
      const v = await votePair(model.judge, oa, ob, sealed, manifest.judge.votesPerPair);
      // One metered call per individual vote.
      for (let i = 0; i < manifest.judge.votesPerPair; i++) charge("judgment", "judge");
      votes.push(...v);
    }
    const sel = select(evals, votes);
    judgment = sel.judgment;
    winner = judgment.winner;
    state.activeSpecimens = passers;
    state = appendEvent(state, "judgment", "winner", `winner=${winner}, ranking=[${judgment.ranking.join(",")}]`);

    // Pressure log for culled + non-winning passers (F9).
    const culled = buildPressure(manifest.id, evals, outputs, judgment.ranking.slice(1).concat(sel.eliminated.map((e) => e.specimen)));
    await persistPressure(root, manifest.id, culled, judgment.advantages);
    recordArtifact(`50-pressure/${manifest.id}`);

    // Documenter → as-built spec → spec-diff (F13).
    const winnerOut = outById.get(winner!)!;
    asBuilt = await model.documenter.asBuilt(winnerOut);
    charge("judgment", "documenter");
    const sdiff = diffSpecs(intent, asBuilt);
    await writeDoc(root, join(sliceDir, "tournament.md"), {
      frontmatter: { summary: `Tournament ${manifest.id}: winner specimen-${winner}, ${passers.length}/${outputs.length} passed gate.` },
      body:
        `# Tournament — ${manifest.id}\n\n- **winner:** specimen-${winner}\n- **ranking:** ${judgment.ranking.join(" > ")}\n` +
        `- **votes:** ${votes.length} pairwise (V=${manifest.judge.votesPerPair}/pair)\n\n## GRPO advantages\n` +
        judgment.advantages.map((a) => `- specimen-${a.specimen}: reward=${a.reward.toFixed(3)} advantage=${a.advantage.toFixed(3)}`).join("\n") +
        "\n",
    });
    await writeDoc(root, join(sliceDir, "spec-diff.md"), {
      frontmatter: { summary: `Spec diff ${manifest.id}: ${sdiff.missing.length} missing, ${sdiff.added.length} added, ${sdiff.kept.length} kept.` },
      body: renderSpecDiff(manifest.id, sdiff),
    });
    recordArtifact(`${sliceDir}/tournament.md`, `${sliceDir}/spec-diff.md`);

    state = setPhaseStatus(state, "judgment", "done");
    await checkpoint();
    log(`[${manifest.id}] winner=specimen-${winner}; faithful=${isFaithful(sdiff)}`);
    await writeAudit(root, manifest.id, tracker, state);
    return {
      sliceId: manifest.id,
      state,
      judgment,
      winner,
      halted: false,
      faithful: isFaithful(sdiff),
      artifacts,
      rounds,
    };
  }
}
362
+
363
/**
 * Builds the pressure-log entries for the specimens that were culled.
 *
 * @param _sliceId Unused; kept for signature parity with the other helpers.
 * @param evals Eval results; output order follows this array.
 * @param outputs Specimen outputs, used to render each entry's file dump.
 * @param culledIds Specimens to include. An EMPTY list means "include every
 *   evaluated specimen" (the no-passers case), not "include none".
 * @returns One entry per selected specimen. The reason names the detected
 *   hack patterns when there are any, otherwise the test pass rate.
 */
function buildPressure(
  _sliceId: string,
  evals: { specimen: SpecimenId; testPassRate: number; hackFindings: any[] }[],
  outputs: SpecimenOutput[],
  culledIds: SpecimenId[],
): CulledSpecimen[] {
  const selected = new Set<SpecimenId>(
    culledIds.length > 0 ? culledIds : evals.map((ev) => ev.specimen), // all, when none passed
  );

  const outputBySpecimen = new Map<SpecimenId, SpecimenOutput>();
  for (const output of outputs) {
    outputBySpecimen.set(output.specimen, output);
  }

  const entries: CulledSpecimen[] = [];
  for (const ev of evals) {
    if (!selected.has(ev.specimen)) continue;

    let reason: string;
    if (ev.hackFindings.length) {
      const patterns = ev.hackFindings.map((finding) => finding.pattern);
      reason = `hack: ${patterns.join(",")}`;
    } else {
      reason = `gate testPassRate=${ev.testPassRate.toFixed(2)}`;
    }

    // Pseudo-diff: every produced file dumped under a "+++ <path>" header.
    const produced = outputBySpecimen.get(ev.specimen);
    let diff = "";
    if (produced) {
      diff = Object.entries(produced.files)
        .map(([filePath, contents]) => `+++ ${filePath}\n${contents}`)
        .join("\n");
    }

    entries.push({
      specimen: ev.specimen,
      reason,
      diff,
      critique: "",
      hackFindings: ev.hackFindings,
    } as CulledSpecimen);
  }
  return entries;
}
388
+
389
/**
 * Writes the pressure log for a slice under `50-pressure/<sliceId>/`.
 *
 * `pressure.md` is always written; `refinement.md` is written only when at
 * least one advantage entry is supplied.
 *
 * @param root Root directory passed through to the document writer.
 * @param sliceId Slice whose pressure directory is written.
 * @param culled Entries to record in the pressure log.
 * @param advantages Advantage entries used to render the refinement context.
 */
async function persistPressure(
  root: string,
  sliceId: string,
  culled: CulledSpecimen[],
  advantages: Judgment["advantages"],
): Promise<void> {
  const pressureLog: PressureLog = { sliceId, culled };

  await writeDoc(root, join("50-pressure", sliceId, "pressure.md"), {
    frontmatter: { summary: `Pressure log ${sliceId}: ${culled.length} culled specimen(s).` },
    body: renderPressureLog(pressureLog),
  });

  // Nothing to refine from when there are no advantages.
  if (advantages.length === 0) {
    return;
  }

  await writeDoc(root, join("50-pressure", sliceId, "refinement.md"), {
    frontmatter: { summary: `PDR top-K refinement context for ${sliceId}.` },
    body: refinementContext(pressureLog, advantages),
  });
}
407
+
408
/**
 * Materializes the audit artifacts for a slice under `90-audit/`:
 * the raw call ledger (`calls/<sliceId>.jsonl`), a cost summary (`cost.md`)
 * and the event journal (`journal.md`).
 *
 * @param root Root directory passed through to the path and document helpers.
 * @param sliceId Slice the audit belongs to.
 * @param tracker Cost tracker holding the recorded calls.
 * @param state Slice state supplying the budget figures and the event list.
 */
async function writeAudit(
  root: string,
  sliceId: string,
  tracker: CostTracker,
  state: SliceState,
): Promise<void> {
  // Raw call ledger, one JSON record per line.
  const ledgerFile = stzPath(root, join("90-audit", "calls", `${sliceId}.jsonl`));
  const ledgerDir = join(ledgerFile, "..");
  await mkdir(ledgerDir, { recursive: true });
  await writeFile(ledgerFile, tracker.toJSONL() + "\n", "utf8");

  // Cost summary.
  const costSummary = `Cost for ${sliceId}: ${tracker.totalTokens()} tokens over ${tracker.count()} calls.`;
  const costBody =
    `# Cost — ${sliceId}\n\n- **total tokens:** ${tracker.totalTokens()}\n- **calls:** ${tracker.count()}\n` +
    `- **budget cap:** ${state.budget.tokenCap}\n- **within cap:** ${state.budget.tokensSpent <= state.budget.tokenCap}\n`;
  await writeDoc(root, join("90-audit", "cost.md"), {
    frontmatter: { summary: costSummary },
    body: costBody,
  });

  // Event journal, one numbered line per recorded event.
  const journalLines = state.events.map((e) => `${e.seq}. [${e.phase}] ${e.kind}: ${e.detail}`);
  const journalBody = `# Journal — ${sliceId}\n\n` + journalLines.join("\n") + "\n";
  await writeDoc(root, join("90-audit", "journal.md"), {
    frontmatter: { summary: `Replayable event journal for ${sliceId}: ${state.events.length} events.` },
    body: journalBody,
  });
}
431
+
432
/**
 * Renders the markdown failure report written when a slice halts.
 *
 * @param manifest Slice manifest; only its `id` is used, in the title.
 * @param evals Eval results of the final round, one bullet each.
 * @param rounds Number of rounds run before halting.
 * @returns The report as a newline-joined string with no trailing newline.
 *   The escalation-budget section is fixed text ("exhausted (1/1)").
 */
function structuredFailure(
  manifest: SliceManifest,
  evals: { specimen: SpecimenId; testPassRate: number; hackFindings: any[] }[],
  rounds: number,
): string {
  const report: string[] = [];

  report.push(`# Structured failure report — ${manifest.id}`);
  report.push("");
  report.push(`Halted after ${rounds} round(s): no specimen passed the sealed eval gate.`);
  report.push("");

  report.push("## Per-specimen outcome (final round)");
  for (const ev of evals) {
    const hackList = ev.hackFindings.map((finding) => finding.pattern).join(",") || "none";
    report.push(
      `- specimen-${ev.specimen}: testPassRate=${ev.testPassRate.toFixed(2)}, ` +
        `hacks=[${hackList}]`,
    );
  }
  report.push("");

  report.push("## Escalation budget");
  report.push("- GRPO retry: exhausted (1/1)");
  report.push("- replan: exhausted (1/1)");
  report.push("");

  report.push("Recommend human review of the contract and sealed suite difficulty.");
  return report.join("\n");
}
456
+
457
+ export { PHASES };