slice-tournament-zoo 0.6.0 → 0.7.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -399,7 +399,8 @@ For contributors and anyone going past day-to-day operation:
399
399
  - **Sealed-suite integrity** — the guide-vs-sensor contract behind the frozen
400
400
  held-out suite: [`docs/development/sealed-suite.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/sealed-suite.md).
401
401
  - **Requirement-to-test mapping** — [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
402
- - **What is real versus deferred** — [`docs/AS-BUILT.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/AS-BUILT.md).
402
+ - **Roadmap — what is built, deferred, and planned next** —
403
+ [`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
403
404
 
404
405
  ## License
405
406
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "slice-tournament-zoo",
3
- "version": "0.6.0",
3
+ "version": "0.7.1",
4
4
  "description": "STZ: a contract-bounded slice pipeline that implements each slice adversarially via an N-specimen tournament with frozen sealed tests, GRPO-style selection, layered anti-reward-hacking, and a replayable markdown audit trail.",
5
5
  "license": "Apache-2.0",
6
6
  "homepage": "https://github.com/dr-robert-li/slice-tournament-zoo#readme",
package/src/README.md CHANGED
@@ -14,6 +14,6 @@ the production path — see [`mock/`](https://github.com/dr-robert-li/slice-tour
14
14
  ## Further reading
15
15
 
16
16
  - The requirement-to-test mapping is in [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
17
- - What is real versus deferred is in [`docs/AS-BUILT.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/AS-BUILT.md).
17
+ - What is built, deferred, and planned next is in [`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
18
18
  - Running the engine locally / in CI: [`docs/development/local-and-testing.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/local-and-testing.md).
19
19
  - The deterministic bridge CLI: [`docs/development/bridge-cli.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/bridge-cli.md).
package/src/bridge.ts CHANGED
@@ -33,6 +33,7 @@ import type {
33
33
  ProjectPhase,
34
34
  ProjectSliceEntry,
35
35
  RunConfig,
36
+ SpecimenId,
36
37
  } from "./types.js";
37
38
  import { PROJECT_PHASES } from "./types.js";
38
39
  import { scaffold, writeDoc, readDoc, stzPath } from "./taxonomy.js";
@@ -57,6 +58,7 @@ import {
57
58
  } from "./project.js";
58
59
  import { detectHacks } from "./hack-detector.js";
59
60
  import { STZ_VERSION, SCHEMA_VERSION, PACKAGE_NAME } from "./version.js";
61
+ import { onNoPassers, type EscalationState } from "./escalation.js";
60
62
  import { evalGate, select, pairings } from "./selection.js";
61
63
  import { diffSpecs, renderSpecDiff, isFaithful, unmatchedIntentIds, mismatchedAsBuiltIds, type Spec } from "./specdiff.js";
62
64
  import { seal, verifySeal, amendSeal, heldOutFiles } from "./seal.js";
@@ -247,10 +249,136 @@ function gate(args: Record<string, string>): void {
247
249
  const { root, slice } = args as { root: string; slice: string };
248
250
  const evals = loadEvals(root, slice);
249
251
  const { passers, eliminated } = evalGate(evals);
250
- // Emit the pairing schedule the command must drive with judge agents.
252
+ // Emit the pairing schedule the command must drive with judge agents. `gate`
253
+ // is a pure read — it never advances escalation. When `passers` is empty the
254
+ // command calls `escalate` (below), which owns the state transition; keeping
255
+ // them separate means a re-run of `gate` can't double-advance the FSM.
251
256
  print({ passers, eliminated, pairings: pairings(passers) });
252
257
  }
253
258
 
259
+ /** Build the pressure-log entries: every specimen that is not the winner is a
260
+ * negative exemplar (F9). `winner` is null for a no-passers round (all culled). */
261
+ function culledFromEvals(
262
+ root: string,
263
+ slice: string,
264
+ evals: EvalResult[],
265
+ winner: SpecimenId | null,
266
+ ): CulledSpecimen[] {
267
+ return evals
268
+ .filter((e) => e.specimen !== winner)
269
+ .map((e) => ({
270
+ specimen: e.specimen,
271
+ reason: e.hackFindings.length
272
+ ? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
273
+ : `gate testPassRate=${e.testPassRate.toFixed(2)}`,
274
+ diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
275
+ .map(([p, c]) => `+++ ${p}\n${c}`)
276
+ .join("\n"),
277
+ critique: "",
278
+ hackFindings: e.hackFindings,
279
+ }));
280
+ }
281
+
282
+ /**
283
+ * Bounded cross-round escalation (F14), driven from the command-level `/stz:run`
284
+ * loop. Call this ONCE after a gate that yielded zero passers. It is the single
285
+ * deterministic owner of "are we allowed another round?": it advances the
286
+ * escalation FSM over `state.json`, persists the new counts, and on retry/replan
287
+ * writes the PDR refinement context the next round's specimens consume — exactly
288
+ * the path the mock orchestrator drives internally, now exposed to the real
289
+ * command so it is not the LLM deciding when to stop.
290
+ *
291
+ * The sealed suite is NOT touched here: retry/replan re-enter the tournament with
292
+ * the SAME frozen suite (the command re-runs `seal-verify` each round). Re-using
293
+ * the FSM's hard ceiling (≤1 retry, ≤1 replan) means even a stray double-call is
294
+ * fail-safe — it halts early, it never loops.
295
+ */
296
+ async function escalateCmd(args: Record<string, string>): Promise<void> {
297
+ const { root, slice } = args as { root: string; slice: string };
298
+ const evals = loadEvals(root, slice);
299
+ let state = await loadState(root, slice);
300
+
301
+ const cur: EscalationState = {
302
+ stage: state.escalation,
303
+ retryCount: state.retryCount,
304
+ replanCount: state.replanCount,
305
+ };
306
+ // The round that just failed (1-based): rounds already consumed + this one.
307
+ const failedRound = cur.retryCount + cur.replanCount + 1;
308
+ const { next, action } = onNoPassers(cur);
309
+ state.escalation = next.stage;
310
+ state.retryCount = next.retryCount;
311
+ state.replanCount = next.replanCount;
312
+ state = appendEvent(state, "judgment", `escalation-${action.type}`, action.note);
313
+
314
+ // The whole field is culled this round (no winner). Persist the pressure log so
315
+ // the negative exemplars are auditable regardless of what comes next (F9).
316
+ const culled = culledFromEvals(root, slice, evals, null);
317
+ await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
318
+ frontmatter: { summary: `Pressure log ${slice}: round ${failedRound}, ${culled.length} culled (no passers).` },
319
+ body: renderPressureLog({ sliceId: slice, culled }),
320
+ });
321
+
322
+ if (action.type === "halt") {
323
+ const report =
324
+ `# Failure report — ${slice}\n\n` +
325
+ `No specimen passed the sealed-suite gate after ${failedRound} round(s) ` +
326
+ `(${next.retryCount} retry, ${next.replanCount} replan). The bounded-escalation ` +
327
+ `budget (≤1 retry, ≤1 replan) is exhausted; halting per F14.\n\n` +
328
+ `## Per-specimen gate outcomes (final round)\n` +
329
+ evals
330
+ .map((e) => {
331
+ const why = e.hackFindings.length
332
+ ? `disqualified — hack: ${e.hackFindings.map((f) => f.pattern).join(", ")}`
333
+ : `gate fail — testPassRate=${e.testPassRate.toFixed(2)}, coverage=${e.coverage.toFixed(2)}, mutation=${e.mutationScore.toFixed(2)}`;
334
+ return `- specimen-${e.specimen}: ${why}`;
335
+ })
336
+ .join("\n") +
337
+ "\n";
338
+ state.failureReport = report;
339
+ state = setPhaseStatus(state, "judgment", "failed");
340
+ await writeDoc(root, join(sliceRel(slice), "failure-report.md"), {
341
+ frontmatter: { summary: `Halt: no passers after ${failedRound} round(s).` },
342
+ body: report,
343
+ });
344
+ await saveState(root, state);
345
+ print({
346
+ action: "halt",
347
+ note: action.note,
348
+ round: failedRound,
349
+ escalation: state.escalation,
350
+ retryCount: state.retryCount,
351
+ replanCount: state.replanCount,
352
+ failureReportPath: stzPath(root, join(sliceRel(slice), "failure-report.md")),
353
+ });
354
+ return;
355
+ }
356
+
357
+ // retry or replan → build the PDR refinement context (F9) from this round's
358
+ // group-relative advantages (no votes: GRPO over the eval rewards alone), the
359
+ // same computation the mock uses (orchestrator select(evals, [])).
360
+ const advantages = select(evals, []).judgment.advantages;
361
+ await writeDoc(root, join("50-pressure", slice, "refinement.md"), {
362
+ frontmatter: { summary: `PDR refinement for ${slice} after round ${failedRound} (${action.type}).` },
363
+ body: refinementContext({ sliceId: slice, culled }, advantages),
364
+ });
365
+ if (action.type === "replan") {
366
+ // Re-enter planning: the command rewrites intent.json before re-spawning.
367
+ state = setPhaseStatus(state, "planning", "running");
368
+ }
369
+ await saveState(root, state);
370
+ print({
371
+ action: action.type,
372
+ note: action.note,
373
+ round: failedRound,
374
+ nextRound: failedRound + 1,
375
+ escalation: state.escalation,
376
+ retryCount: state.retryCount,
377
+ replanCount: state.replanCount,
378
+ refinementPath: stzPath(root, join("50-pressure", slice, "refinement.md")),
379
+ });
380
+ }
381
+
254
382
  function recordVotes(args: Record<string, string>): void {
255
383
  const { root, slice } = args as { root: string; slice: string };
256
384
  const votes = readJSON<PairwiseVote[]>(args.votes!);
@@ -293,19 +421,7 @@ async function finalize(args: Record<string, string>): Promise<void> {
293
421
  : { ranking: [], winner: null, advantages: [], votes: [] };
294
422
 
295
423
  // Pressure log: every non-winning specimen is a negative exemplar (F9).
296
- const culled: CulledSpecimen[] = evals
297
- .filter((e) => e.specimen !== judgment.winner)
298
- .map((e) => ({
299
- specimen: e.specimen,
300
- reason: e.hackFindings.length
301
- ? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
302
- : `gate testPassRate=${e.testPassRate.toFixed(2)}`,
303
- diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
304
- .map(([p, c]) => `+++ ${p}\n${c}`)
305
- .join("\n"),
306
- critique: "",
307
- hackFindings: e.hackFindings,
308
- }));
424
+ const culled = culledFromEvals(root, slice, evals, judgment.winner);
309
425
  await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
310
426
  frontmatter: { summary: `Pressure log ${slice}: ${culled.length} culled.` },
311
427
  body: renderPressureLog({ sliceId: slice, culled }),
@@ -932,6 +1048,7 @@ export async function runBridge(argv: string[]): Promise<void> {
932
1048
  case "record-eval": recordEval(args); break;
933
1049
  case "eval": evalCmd(args); break;
934
1050
  case "gate": gate(args); break;
1051
+ case "escalate": await escalateCmd(args); break;
935
1052
  case "record-votes": recordVotes(args); break;
936
1053
  case "select": await selectCmd(args); break;
937
1054
  case "finalize": await finalize(args); break;
@@ -8,7 +8,7 @@
8
8
  * The model layer is injected (ModelLayer), so this runs identically against
9
9
  * the deterministic mock and a future live Claude Code / Codex implementation.
10
10
  *
11
- * STUBBED vs the full design (logged via the `log` sink, surfaced in AS-BUILT):
11
+ * STUBBED vs the full design (logged via the `log` sink, surfaced in ROADMAP):
12
12
  * - git worktrees per specimen → prototypes/specimen-X/ directories instead.
13
13
  * - per-worktree ephemeral observability stacks → not spun up.
14
14
  * - live Python eval drivers / mutation / PBT → mock EvalRunner.