slice-tournament-zoo 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/package.json +1 -1
- package/src/README.md +1 -1
- package/src/bridge.ts +131 -14
- package/src/mock/orchestrator.ts +1 -1
package/README.md
CHANGED
|
@@ -399,7 +399,8 @@ For contributors and anyone going past day-to-day operation:
|
|
|
399
399
|
- **Sealed-suite integrity** — the guide-vs-sensor contract behind the frozen
|
|
400
400
|
held-out suite: [`docs/development/sealed-suite.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/sealed-suite.md).
|
|
401
401
|
- **Requirement-to-test mapping** — [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
|
|
402
|
-
- **
|
|
402
|
+
- **Roadmap — what is built, deferred, and planned next** —
|
|
403
|
+
[`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
|
|
403
404
|
|
|
404
405
|
## License
|
|
405
406
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "slice-tournament-zoo",
|
|
3
|
-
"version": "0.6.0",
|
|
3
|
+
"version": "0.7.1",
|
|
4
4
|
"description": "STZ: a contract-bounded slice pipeline that implements each slice adversarially via an N-specimen tournament with frozen sealed tests, GRPO-style selection, layered anti-reward-hacking, and a replayable markdown audit trail.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"homepage": "https://github.com/dr-robert-li/slice-tournament-zoo#readme",
|
package/src/README.md
CHANGED
|
@@ -14,6 +14,6 @@ the production path — see [`mock/`](https://github.com/dr-robert-li/slice-tour
|
|
|
14
14
|
## Further reading
|
|
15
15
|
|
|
16
16
|
- The requirement-to-test mapping is in [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
|
|
17
|
-
- What is
|
|
17
|
+
- What is built, deferred, and planned next is in [`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
|
|
18
18
|
- Running the engine locally / in CI: [`docs/development/local-and-testing.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/local-and-testing.md).
|
|
19
19
|
- The deterministic bridge CLI: [`docs/development/bridge-cli.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/bridge-cli.md).
|
package/src/bridge.ts
CHANGED
|
@@ -33,6 +33,7 @@ import type {
|
|
|
33
33
|
ProjectPhase,
|
|
34
34
|
ProjectSliceEntry,
|
|
35
35
|
RunConfig,
|
|
36
|
+
SpecimenId,
|
|
36
37
|
} from "./types.js";
|
|
37
38
|
import { PROJECT_PHASES } from "./types.js";
|
|
38
39
|
import { scaffold, writeDoc, readDoc, stzPath } from "./taxonomy.js";
|
|
@@ -57,6 +58,7 @@ import {
|
|
|
57
58
|
} from "./project.js";
|
|
58
59
|
import { detectHacks } from "./hack-detector.js";
|
|
59
60
|
import { STZ_VERSION, SCHEMA_VERSION, PACKAGE_NAME } from "./version.js";
|
|
61
|
+
import { onNoPassers, type EscalationState } from "./escalation.js";
|
|
60
62
|
import { evalGate, select, pairings } from "./selection.js";
|
|
61
63
|
import { diffSpecs, renderSpecDiff, isFaithful, unmatchedIntentIds, mismatchedAsBuiltIds, type Spec } from "./specdiff.js";
|
|
62
64
|
import { seal, verifySeal, amendSeal, heldOutFiles } from "./seal.js";
|
|
@@ -247,10 +249,136 @@ function gate(args: Record<string, string>): void {
|
|
|
247
249
|
const { root, slice } = args as { root: string; slice: string };
|
|
248
250
|
const evals = loadEvals(root, slice);
|
|
249
251
|
const { passers, eliminated } = evalGate(evals);
|
|
250
|
-
// Emit the pairing schedule the command must drive with judge agents.
|
|
252
|
+
// Emit the pairing schedule the command must drive with judge agents. `gate`
|
|
253
|
+
// is a pure read — it never advances escalation. When `passers` is empty the
|
|
254
|
+
// command calls `escalate` (below), which owns the state transition; keeping
|
|
255
|
+
// them separate means a re-run of `gate` can't double-advance the FSM.
|
|
251
256
|
print({ passers, eliminated, pairings: pairings(passers) });
|
|
252
257
|
}
|
|
253
258
|
|
|
259
|
+
/** Build the pressure-log entries: every specimen that is not the winner is a
|
|
260
|
+
* negative exemplar (F9). `winner` is null for a no-passers round (all culled). */
|
|
261
|
+
function culledFromEvals(
|
|
262
|
+
root: string,
|
|
263
|
+
slice: string,
|
|
264
|
+
evals: EvalResult[],
|
|
265
|
+
winner: SpecimenId | null,
|
|
266
|
+
): CulledSpecimen[] {
|
|
267
|
+
return evals
|
|
268
|
+
.filter((e) => e.specimen !== winner)
|
|
269
|
+
.map((e) => ({
|
|
270
|
+
specimen: e.specimen,
|
|
271
|
+
reason: e.hackFindings.length
|
|
272
|
+
? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
|
|
273
|
+
: `gate testPassRate=${e.testPassRate.toFixed(2)}`,
|
|
274
|
+
diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
|
|
275
|
+
.map(([p, c]) => `+++ ${p}\n${c}`)
|
|
276
|
+
.join("\n"),
|
|
277
|
+
critique: "",
|
|
278
|
+
hackFindings: e.hackFindings,
|
|
279
|
+
}));
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
|
|
283
|
+
* Bounded cross-round escalation (F14), driven from the command-level `/stz:run`
|
|
284
|
+
* loop. Call this ONCE after a gate that yielded zero passers. It is the single
|
|
285
|
+
* deterministic owner of "are we allowed another round?": it advances the
|
|
286
|
+
* escalation FSM over `state.json`, persists the new counts, and on retry/replan
|
|
287
|
+
* writes the PDR refinement context the next round's specimens consume — exactly
|
|
288
|
+
* the path the mock orchestrator drives internally, now exposed to the real
|
|
289
|
+
* command so it is not the LLM deciding when to stop.
|
|
290
|
+
*
|
|
291
|
+
* The sealed suite is NOT touched here: retry/replan re-enter the tournament with
|
|
292
|
+
* the SAME frozen suite (the command re-runs `seal-verify` each round). Re-using
|
|
293
|
+
* the FSM's hard ceiling (≤1 retry, ≤1 replan) means even a stray double-call is
|
|
294
|
+
* fail-safe — it halts early, it never loops.
|
|
295
|
+
*/
|
|
296
|
+
async function escalateCmd(args: Record<string, string>): Promise<void> {
|
|
297
|
+
const { root, slice } = args as { root: string; slice: string };
|
|
298
|
+
const evals = loadEvals(root, slice);
|
|
299
|
+
let state = await loadState(root, slice);
|
|
300
|
+
|
|
301
|
+
const cur: EscalationState = {
|
|
302
|
+
stage: state.escalation,
|
|
303
|
+
retryCount: state.retryCount,
|
|
304
|
+
replanCount: state.replanCount,
|
|
305
|
+
};
|
|
306
|
+
// The round that just failed (1-based): rounds already consumed + this one.
|
|
307
|
+
const failedRound = cur.retryCount + cur.replanCount + 1;
|
|
308
|
+
const { next, action } = onNoPassers(cur);
|
|
309
|
+
state.escalation = next.stage;
|
|
310
|
+
state.retryCount = next.retryCount;
|
|
311
|
+
state.replanCount = next.replanCount;
|
|
312
|
+
state = appendEvent(state, "judgment", `escalation-${action.type}`, action.note);
|
|
313
|
+
|
|
314
|
+
// The whole field is culled this round (no winner). Persist the pressure log so
|
|
315
|
+
// the negative exemplars are auditable regardless of what comes next (F9).
|
|
316
|
+
const culled = culledFromEvals(root, slice, evals, null);
|
|
317
|
+
await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
|
|
318
|
+
frontmatter: { summary: `Pressure log ${slice}: round ${failedRound}, ${culled.length} culled (no passers).` },
|
|
319
|
+
body: renderPressureLog({ sliceId: slice, culled }),
|
|
320
|
+
});
|
|
321
|
+
|
|
322
|
+
if (action.type === "halt") {
|
|
323
|
+
const report =
|
|
324
|
+
`# Failure report — ${slice}\n\n` +
|
|
325
|
+
`No specimen passed the sealed-suite gate after ${failedRound} round(s) ` +
|
|
326
|
+
`(${next.retryCount} retry, ${next.replanCount} replan). The bounded-escalation ` +
|
|
327
|
+
`budget (≤1 retry, ≤1 replan) is exhausted; halting per F14.\n\n` +
|
|
328
|
+
`## Per-specimen gate outcomes (final round)\n` +
|
|
329
|
+
evals
|
|
330
|
+
.map((e) => {
|
|
331
|
+
const why = e.hackFindings.length
|
|
332
|
+
? `disqualified — hack: ${e.hackFindings.map((f) => f.pattern).join(", ")}`
|
|
333
|
+
: `gate fail — testPassRate=${e.testPassRate.toFixed(2)}, coverage=${e.coverage.toFixed(2)}, mutation=${e.mutationScore.toFixed(2)}`;
|
|
334
|
+
return `- specimen-${e.specimen}: ${why}`;
|
|
335
|
+
})
|
|
336
|
+
.join("\n") +
|
|
337
|
+
"\n";
|
|
338
|
+
state.failureReport = report;
|
|
339
|
+
state = setPhaseStatus(state, "judgment", "failed");
|
|
340
|
+
await writeDoc(root, join(sliceRel(slice), "failure-report.md"), {
|
|
341
|
+
frontmatter: { summary: `Halt: no passers after ${failedRound} round(s).` },
|
|
342
|
+
body: report,
|
|
343
|
+
});
|
|
344
|
+
await saveState(root, state);
|
|
345
|
+
print({
|
|
346
|
+
action: "halt",
|
|
347
|
+
note: action.note,
|
|
348
|
+
round: failedRound,
|
|
349
|
+
escalation: state.escalation,
|
|
350
|
+
retryCount: state.retryCount,
|
|
351
|
+
replanCount: state.replanCount,
|
|
352
|
+
failureReportPath: stzPath(root, join(sliceRel(slice), "failure-report.md")),
|
|
353
|
+
});
|
|
354
|
+
return;
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// retry or replan → build the PDR refinement context (F9) from this round's
|
|
358
|
+
// group-relative advantages (no votes: GRPO over the eval rewards alone), the
|
|
359
|
+
// same computation the mock uses (orchestrator select(evals, [])).
|
|
360
|
+
const advantages = select(evals, []).judgment.advantages;
|
|
361
|
+
await writeDoc(root, join("50-pressure", slice, "refinement.md"), {
|
|
362
|
+
frontmatter: { summary: `PDR refinement for ${slice} after round ${failedRound} (${action.type}).` },
|
|
363
|
+
body: refinementContext({ sliceId: slice, culled }, advantages),
|
|
364
|
+
});
|
|
365
|
+
if (action.type === "replan") {
|
|
366
|
+
// Re-enter planning: the command rewrites intent.json before re-spawning.
|
|
367
|
+
state = setPhaseStatus(state, "planning", "running");
|
|
368
|
+
}
|
|
369
|
+
await saveState(root, state);
|
|
370
|
+
print({
|
|
371
|
+
action: action.type,
|
|
372
|
+
note: action.note,
|
|
373
|
+
round: failedRound,
|
|
374
|
+
nextRound: failedRound + 1,
|
|
375
|
+
escalation: state.escalation,
|
|
376
|
+
retryCount: state.retryCount,
|
|
377
|
+
replanCount: state.replanCount,
|
|
378
|
+
refinementPath: stzPath(root, join("50-pressure", slice, "refinement.md")),
|
|
379
|
+
});
|
|
380
|
+
}
|
|
381
|
+
|
|
254
382
|
function recordVotes(args: Record<string, string>): void {
|
|
255
383
|
const { root, slice } = args as { root: string; slice: string };
|
|
256
384
|
const votes = readJSON<PairwiseVote[]>(args.votes!);
|
|
@@ -293,19 +421,7 @@ async function finalize(args: Record<string, string>): Promise<void> {
|
|
|
293
421
|
: { ranking: [], winner: null, advantages: [], votes: [] };
|
|
294
422
|
|
|
295
423
|
// Pressure log: every non-winning specimen is a negative exemplar (F9).
|
|
296
|
-
const culled = evals
|
|
297
|
-
.filter((e) => e.specimen !== judgment.winner)
|
|
298
|
-
.map((e) => ({
|
|
299
|
-
specimen: e.specimen,
|
|
300
|
-
reason: e.hackFindings.length
|
|
301
|
-
? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
|
|
302
|
-
: `gate testPassRate=${e.testPassRate.toFixed(2)}`,
|
|
303
|
-
diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
|
|
304
|
-
.map(([p, c]) => `+++ ${p}\n${c}`)
|
|
305
|
-
.join("\n"),
|
|
306
|
-
critique: "",
|
|
307
|
-
hackFindings: e.hackFindings,
|
|
308
|
-
}));
|
|
424
|
+
const culled = culledFromEvals(root, slice, evals, judgment.winner);
|
|
309
425
|
await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
|
|
310
426
|
frontmatter: { summary: `Pressure log ${slice}: ${culled.length} culled.` },
|
|
311
427
|
body: renderPressureLog({ sliceId: slice, culled }),
|
|
@@ -932,6 +1048,7 @@ export async function runBridge(argv: string[]): Promise<void> {
|
|
|
932
1048
|
case "record-eval": recordEval(args); break;
|
|
933
1049
|
case "eval": evalCmd(args); break;
|
|
934
1050
|
case "gate": gate(args); break;
|
|
1051
|
+
case "escalate": await escalateCmd(args); break;
|
|
935
1052
|
case "record-votes": recordVotes(args); break;
|
|
936
1053
|
case "select": await selectCmd(args); break;
|
|
937
1054
|
case "finalize": await finalize(args); break;
|
package/src/mock/orchestrator.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* The model layer is injected (ModelLayer), so this runs identically against
|
|
9
9
|
* the deterministic mock and a future live Claude Code / Codex implementation.
|
|
10
10
|
*
|
|
11
|
-
* STUBBED vs the full design (logged via the `log` sink, surfaced in
|
|
11
|
+
* STUBBED vs the full design (logged via the `log` sink, surfaced in ROADMAP):
|
|
12
12
|
* - git worktrees per specimen → prototypes/specimen-X/ directories instead.
|
|
13
13
|
* - per-worktree ephemeral observability stacks → not spun up.
|
|
14
14
|
* - live Python eval drivers / mutation / PBT → mock EvalRunner.
|