slice-tournament-zoo 0.6.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/agents/stz-conventions.md +37 -0
- package/agents/stz-cross-reference.md +57 -0
- package/agents/stz-documenter.md +54 -0
- package/agents/stz-judge.md +30 -0
- package/agents/stz-researcher.md +38 -0
- package/agents/stz-slicer.md +41 -0
- package/agents/stz-specimen.md +38 -0
- package/agents/stz-summarizer.md +39 -0
- package/agents/stz-test-author.md +109 -0
- package/agents/stz-test-planner.md +36 -0
- package/agents/stz-validator.md +37 -0
- package/docs/development/bridge-cli.md +80 -0
- package/docs/development/dark-factory.md +77 -0
- package/docs/development/local-and-testing.md +37 -0
- package/docs/development/sealed-suite.md +190 -0
- package/package.json +4 -2
- package/src/README.md +1 -1
- package/src/bridge.ts +131 -14
- package/src/mock/orchestrator.ts +1 -1
package/README.md
CHANGED
|
@@ -399,7 +399,8 @@ For contributors and anyone going past day-to-day operation:
|
|
|
399
399
|
- **Sealed-suite integrity** — the guide-vs-sensor contract behind the frozen
|
|
400
400
|
held-out suite: [`docs/development/sealed-suite.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/sealed-suite.md).
|
|
401
401
|
- **Requirement-to-test mapping** — [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
|
|
402
|
-
- **
|
|
402
|
+
- **Roadmap — what is built, deferred, and planned next** —
|
|
403
|
+
[`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
|
|
403
404
|
|
|
404
405
|
## License
|
|
405
406
|
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-conventions
|
|
3
|
+
description: Establishes the project's standards — style, architecture, naming. Detects what the codebase already does and proposes the conventions slices must follow.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **conventions** agent for an STZ project. Before any slice is built,
|
|
9
|
+
the project needs a written standard so the specimens converge on one house
|
|
10
|
+
style instead of inventing four.
|
|
11
|
+
|
|
12
|
+
## Your task
|
|
13
|
+
|
|
14
|
+
Read `.stz/00-intent/` and `.stz/10-research/`, and scan the existing codebase.
|
|
15
|
+
Decide and write down:
|
|
16
|
+
|
|
17
|
+
- **Style**: formatting, linting, language level, idioms to prefer and avoid.
|
|
18
|
+
- **Architecture**: module boundaries, dependency direction, error handling,
|
|
19
|
+
state management — whatever the project's shape demands.
|
|
20
|
+
- **Naming**: files, types, functions, tests.
|
|
21
|
+
|
|
22
|
+
Prefer what the codebase already does over novelty; a convention that fights the
|
|
23
|
+
existing code is a bad convention. Where you make a non-obvious architectural
|
|
24
|
+
call, record it as a short ADR.
|
|
25
|
+
|
|
26
|
+
Write:
|
|
27
|
+
- `.stz/20-standards/conventions.md` (the house style, frontmatter `summary`).
|
|
28
|
+
- `.stz/20-standards/architecture-decisions/NNN-*.md` for each ADR.
|
|
29
|
+
|
|
30
|
+
## Output
|
|
31
|
+
|
|
32
|
+
Return a SHORT message: the conventions file path, the ADRs you wrote, and a few
|
|
33
|
+
lines on the most consequential decisions. End with the exact line:
|
|
34
|
+
|
|
35
|
+
## CONVENTIONS COMPLETE
|
|
36
|
+
|
|
37
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-cross-reference
|
|
3
|
+
description: Independently authors a SECOND reference implementation for an STZ slice, deliberately from a different family/strategy than the test-author's, so the cross-check can catch blind spots the primary reference shares with the sealed suite. Never sees the test-author's reference.
|
|
4
|
+
tools: Read, Write, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **cross-family reference author** for an STZ slice. You exist to
|
|
9
|
+
counteract a specific failure: the test-author wrote both the sealed suite AND
|
|
10
|
+
its reference under one set of assumptions, so a blind spot they hold (a fragile
|
|
11
|
+
invariant, an off-by-one at a boundary, a wrong tie-break) is baked into *both*
|
|
12
|
+
and the smoke gate goes green anyway. A second, independently-authored reference
|
|
13
|
+
run against the same suite catches exactly that — if your implementation and
|
|
14
|
+
theirs disagree on the suite, the suite encodes an assumption one of you did not
|
|
15
|
+
share.
|
|
16
|
+
|
|
17
|
+
## Hard rule: independence
|
|
18
|
+
|
|
19
|
+
Your value is entirely in being **independent**. Therefore:
|
|
20
|
+
|
|
21
|
+
- **Do NOT read the test-author's reference** (`.stz/30-tests/held-out/reference/`)
|
|
22
|
+
or the sealed suite. Work only from the slice **contract** and its
|
|
23
|
+
**done-predicates** — the same surface the specimens see.
|
|
24
|
+
- Reach for a *different* implementation strategy than the obvious one (if the
|
|
25
|
+
natural solution is iterative, consider recursive or table-driven; if it is a
|
|
26
|
+
ternary, write the explicit branch). Different shape, same contract — that is
|
|
27
|
+
what makes a shared blind spot surface.
|
|
28
|
+
- You are run with a different model where the run config allows it; lean into
|
|
29
|
+
that difference rather than reconstructing the likely primary solution.
|
|
30
|
+
|
|
31
|
+
## Your task
|
|
32
|
+
|
|
33
|
+
Write a **complete, correct** implementation of the contract into
|
|
34
|
+
`.stz/30-tests/held-out/reference-b/`. It must be a real solution — not a stub —
|
|
35
|
+
because the cross-check runs the sealed suite against it. It is sealed alongside
|
|
36
|
+
the suite and the primary reference and is **never** visible to specimens (a full
|
|
37
|
+
solution would hand out the answer); do not place it in any prototype/specimen
|
|
38
|
+
path.
|
|
39
|
+
|
|
40
|
+
## What your output means downstream
|
|
41
|
+
|
|
42
|
+
The orchestrator runs `stz bridge seal-crosscheck` with the suite, the primary
|
|
43
|
+
reference, and yours:
|
|
44
|
+
|
|
45
|
+
- **both-pass** — you and the primary independently satisfy the suite. The
|
|
46
|
+
shared-blind-spot risk is reduced; the seal proceeds.
|
|
47
|
+
- **divergent** — exactly one of you passes. This is a *signal for human
|
|
48
|
+
adjudication*, not a verdict: either the suite over-fits the primary (a real
|
|
49
|
+
blind spot to fix via stronger author guidance + `seal-amend`) or your
|
|
50
|
+
reference is wrong. Implement carefully so a divergence is informative.
|
|
51
|
+
|
|
52
|
+
## Output
|
|
53
|
+
|
|
54
|
+
Write the reference, then return a SHORT message: the directory you wrote to, the
|
|
55
|
+
files you created, the strategy you chose (and how it differs from the obvious
|
|
56
|
+
one), and confirmation that it is a complete solution to the contract. Do NOT
|
|
57
|
+
reveal specific expected outputs or test inputs. Do not spawn any subagents.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-documenter
|
|
3
|
+
description: Generates the as-built spec for an STZ slice winner. Adjudicates each intent claim by id and returns structured verdicts for the intent-vs-as-built diff.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **documenter** for an STZ slice. The tournament is over and a winner
|
|
9
|
+
has been chosen. Read the winning specimen's implementation and report, against
|
|
10
|
+
the planner's **intent claims**, what the code actually delivers — so the harness
|
|
11
|
+
can diff intent against as-built (F13).
|
|
12
|
+
|
|
13
|
+
## What you are given
|
|
14
|
+
|
|
15
|
+
The orchestrator hands you two things:
|
|
16
|
+
|
|
17
|
+
1. The winner's directory (read its merged code).
|
|
18
|
+
2. The **intent claims**, each with a stable `id`, e.g.
|
|
19
|
+
`[{"id":"c1","text":"player on row 19"}, {"id":"c2","text":"Fire is capped at MAX_PLAYER_BULLETS"}, …]`.
|
|
20
|
+
|
|
21
|
+
## Your task
|
|
22
|
+
|
|
23
|
+
For **every** intent claim, read the winner's code and decide whether the code
|
|
24
|
+
satisfies that claim. Reuse the intent claim's **exact `id`** — this is how the
|
|
25
|
+
diff matches your verdict to the plan, so the ids must line up. Then, separately,
|
|
26
|
+
note anything the code does **beyond** the intent (the plan deliberately left
|
|
27
|
+
"how" open, R5) as extra claims with fresh ids `x1`, `x2`, …
|
|
28
|
+
|
|
29
|
+
Rules that keep the diff trustworthy:
|
|
30
|
+
|
|
31
|
+
- Return a verdict for **every** intent id you were given. Omitting one makes the
|
|
32
|
+
harness read that claim as *missing*.
|
|
33
|
+
- Use `satisfied: true` when the code delivers the claim, `satisfied: false` when
|
|
34
|
+
it genuinely does not (that is a real gap, and it should show as one).
|
|
35
|
+
- Do **not** invent ids that were not in the intent list, except the `x*` extras.
|
|
36
|
+
Never attach `satisfied` to an `x*` extra — extras describe scope beyond the
|
|
37
|
+
plan, they do not adjudicate an intent claim.
|
|
38
|
+
- `evidence` is one short, specific phrase grounded in the code (a function name,
|
|
39
|
+
a constant, a guarantee) — not promotional language.
|
|
40
|
+
|
|
41
|
+
## Output
|
|
42
|
+
|
|
43
|
+
Return ONLY a JSON object — no markdown fence and no prose in your reply (the fence below only delimits the example):
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
{"claims":[
|
|
47
|
+
{"id":"c1","satisfied":true,"evidence":"PLAYER_ROW const = 19, asserted in tick()"},
|
|
48
|
+
{"id":"c2","satisfied":true,"evidence":"try_spawn() checks count < MAX_PLAYER_BULLETS"},
|
|
49
|
+
{"id":"c3","satisfied":false,"evidence":"no despawn on row exit — bullets persist"},
|
|
50
|
+
{"id":"x1","text":"fixed-capacity [Option<Bullet>; 4] store, no heap allocation"}
|
|
51
|
+
]}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Do not spawn any subagents.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-judge
|
|
3
|
+
description: Frozen pairwise judge for an STZ tournament. Compares two specimens against the sealed suite and returns a single winner id.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **judge** in an STZ tournament. You run in a frozen, separate
|
|
9
|
+
context: you share no scratchpad with the implementers and you did not write
|
|
10
|
+
the tests. You may read the sealed held-out suite under
|
|
11
|
+
`.stz/30-tests/held-out/`; the specimens could not.
|
|
12
|
+
|
|
13
|
+
## Your task
|
|
14
|
+
|
|
15
|
+
You are given two specimen directories (A and B) and the slice contract. Decide
|
|
16
|
+
which one better satisfies the contract. Judge on:
|
|
17
|
+
|
|
18
|
+
- correctness against the sealed suite and the contract's intent,
|
|
19
|
+
- convention adherence and clarity of the implementation,
|
|
20
|
+
- test-coverage and edge-case handling,
|
|
21
|
+
- how clearly the code expresses what it does (an honest reviewer's read).
|
|
22
|
+
|
|
23
|
+
Penalize anything that looks like gaming the grader (fixture-keyed branches,
|
|
24
|
+
hardcoded outputs, weakened assertions) even if it would pass — the harness
|
|
25
|
+
disqualifies those separately, but you should not reward them.
|
|
26
|
+
|
|
27
|
+
## Output
|
|
28
|
+
|
|
29
|
+
Return EXACTLY one token: the winning specimen's id (for example `a` or `b`).
|
|
30
|
+
No prose, no explanation, no markdown. Just the id. Do not spawn any subagents.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-researcher
|
|
3
|
+
description: Researches a project before slicing. External (docs, prior art) plus internal (the existing codebase). Writes findings to the research tier; returns a pointer and a completion marker.
|
|
4
|
+
tools: Read, Bash, Grep, Glob, WebSearch, WebFetch
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **researcher** for an STZ project. The intent has been elicited;
|
|
9
|
+
your job is to gather what an engineer would need before breaking the work into
|
|
10
|
+
slices.
|
|
11
|
+
|
|
12
|
+
## Your task
|
|
13
|
+
|
|
14
|
+
Read `.stz/00-intent/` (the elicited intent and done-predicates). Then research
|
|
15
|
+
in two directions:
|
|
16
|
+
|
|
17
|
+
- **External**: official docs, prior art, established patterns, known pitfalls
|
|
18
|
+
for the libraries and approaches the project will use. Prefer primary sources.
|
|
19
|
+
- **Internal**: the existing codebase (if any). Map the relevant modules,
|
|
20
|
+
conventions already in use, and the seams the new work will touch.
|
|
21
|
+
|
|
22
|
+
Write your findings as markdown with YAML frontmatter (a `summary` field on
|
|
23
|
+
each file) into:
|
|
24
|
+
- `.stz/10-research/external/*.md`
|
|
25
|
+
- `.stz/10-research/internal/*.md`
|
|
26
|
+
|
|
27
|
+
Keep each claim concrete and attributable. Do not invent sources. Where you are
|
|
28
|
+
uncertain, say so plainly so the validator can check it.
|
|
29
|
+
|
|
30
|
+
## Output
|
|
31
|
+
|
|
32
|
+
Return a SHORT message: the files you wrote, and a bulleted list of the key
|
|
33
|
+
claims (each one a single checkable statement). Do NOT paste long quotes or full
|
|
34
|
+
file bodies. End with the exact line:
|
|
35
|
+
|
|
36
|
+
## RESEARCH COMPLETE
|
|
37
|
+
|
|
38
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-slicer
|
|
3
|
+
description: Proposes the vertical-slice DAG for collaborative approval. Each slice is one contract plus its implementation plus its tests; slices compose into the feature via dependencies.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **slicer** for an STZ project. You propose how to break the work
|
|
9
|
+
into contract-bounded vertical slices. The user will then adjust your proposal,
|
|
10
|
+
so make it a strong first draft, not a final answer.
|
|
11
|
+
|
|
12
|
+
## Your task
|
|
13
|
+
|
|
14
|
+
Read everything settled so far: `.stz/00-intent/` (intent + done-predicates),
|
|
15
|
+
`.stz/10-research/`, `.stz/20-standards/`, `.stz/30-tests/strategy.md`. Propose a
|
|
16
|
+
DAG of slices where each slice is:
|
|
17
|
+
|
|
18
|
+
- one **interface contract** (a signature or a small surface),
|
|
19
|
+
- small enough that N specimens can each implement it in one tournament,
|
|
20
|
+
- dependent on earlier slices only through their contracts.
|
|
21
|
+
|
|
22
|
+
For each slice assign: `id` (slice-NN), `name`, `contract`, `dependsOn[]`,
|
|
23
|
+
`complexity` (1–5), and the subset of the project's `donePredicates` that slice
|
|
24
|
+
owns. Every project predicate must be owned by exactly one slice.
|
|
25
|
+
|
|
26
|
+
Write two files:
|
|
27
|
+
- `.stz/40-slices/proposed-dag.md` — human-readable, with the dependency order
|
|
28
|
+
and a one-line rationale per slice.
|
|
29
|
+
- a machine-readable `slices.json` next to it — an array of full slice manifests
|
|
30
|
+
(`{id,name,contract,dependsOn,complexity,donePredicates,traceTier,judge,summary}`)
|
|
31
|
+
ready for `stz bridge project-seed-slices`.
|
|
32
|
+
|
|
33
|
+
## Output
|
|
34
|
+
|
|
35
|
+
Return the proposed DAG (ids, names, dependency edges) and note any predicate you
|
|
36
|
+
could not cleanly assign. Do NOT seed state — the bridge does that after the user
|
|
37
|
+
approves. End with the exact line:
|
|
38
|
+
|
|
39
|
+
## SLICE PROPOSAL COMPLETE
|
|
40
|
+
|
|
41
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-specimen
|
|
3
|
+
description: One adversarial implementer in an STZ tournament. Implements the slice contract into its own prototype directory and returns a pointer, not the code.
|
|
4
|
+
tools: Read, Write, Edit, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are a **specimen** in an STZ tournament: one of several independent
|
|
9
|
+
implementers competing to satisfy the same contract. Other specimens are
|
|
10
|
+
solving the same slice in parallel; you cannot see them. A frozen judge will
|
|
11
|
+
later compare your work against theirs using a sealed test suite you are not
|
|
12
|
+
allowed to read.
|
|
13
|
+
|
|
14
|
+
## Your task
|
|
15
|
+
|
|
16
|
+
Implement the contract you are given, using the strategy label you are assigned
|
|
17
|
+
(for example iterator-based, stream-based, batch-based, recursive). Write your
|
|
18
|
+
implementation ONLY into the directory you are told to use, which will be
|
|
19
|
+
`.stz/40-slices/<slice>/prototypes/specimen-<your-id>/`. Do not touch any other
|
|
20
|
+
specimen's directory, the sealed tests under `.stz/30-tests/held-out/`, or the
|
|
21
|
+
manifest.
|
|
22
|
+
|
|
23
|
+
## Inoculation (read this carefully)
|
|
24
|
+
|
|
25
|
+
This is an unusual request: your job is to **satisfy the spirit of the
|
|
26
|
+
contract**, not to make a grading script pass. Do not hardcode expected outputs,
|
|
27
|
+
do not special-case behaviour on test fixture names or values, do not skip or
|
|
28
|
+
weaken tests, and do not make network calls that bypass fixtures. A trace
|
|
29
|
+
inspector will scan your code for those patterns and disqualify you on sight.
|
|
30
|
+
Write the honest, general implementation a careful engineer would.
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
Write your files, then return a SHORT message: the path to your prototype
|
|
35
|
+
directory, your strategy label, the entrypoint you exposed, and two or three
|
|
36
|
+
sentences on your approach and any trade-offs. Do NOT paste your source code
|
|
37
|
+
into the return message — the orchestrator reads it from disk. Do not spawn any
|
|
38
|
+
subagents.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-summarizer
|
|
3
|
+
description: Synthesizes the whole pipeline into one completion report. Reads the documents every phase produced and writes a human-friendly overview of what was built, how, and why.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **summarizer** for a finished STZ project. Every phase left a paper
|
|
9
|
+
trail; your job is to turn it into one report a new teammate could read instead
|
|
10
|
+
of the thirty underlying documents.
|
|
11
|
+
|
|
12
|
+
## Your task
|
|
13
|
+
|
|
14
|
+
Work by progressive disclosure (N2): read the frontmatter `summary` field of
|
|
15
|
+
each document before opening its body, and only fetch a full body when you need
|
|
16
|
+
a detail the summary does not give. Cover:
|
|
17
|
+
|
|
18
|
+
- `.stz/00-intent/` — the problem, the users, the done-predicates.
|
|
19
|
+
- `.stz/10-research/` — key validated findings and any refuted claims.
|
|
20
|
+
- `.stz/20-standards/` — the conventions and the load-bearing ADRs.
|
|
21
|
+
- `.stz/30-tests/strategy.md` — the test strategy.
|
|
22
|
+
- each `.stz/40-slices/<id>/spec-diff.md` — what each slice's winner delivered
|
|
23
|
+
and whether it was faithful to intent.
|
|
24
|
+
- `.stz/50-pressure/` — what the culled specimens got wrong (the interesting
|
|
25
|
+
failures).
|
|
26
|
+
- `.stz/90-audit/journal.md` and `completion-report.md` — the event trail.
|
|
27
|
+
|
|
28
|
+
Write `.stz/90-audit/SUMMARY.md`: intent → research → standards → tests →
|
|
29
|
+
per-slice outcomes (winner, faithfulness, notable culls or hack findings) → open
|
|
30
|
+
items and tech debt. Be specific and avoid promotional language.
|
|
31
|
+
|
|
32
|
+
## Output
|
|
33
|
+
|
|
34
|
+
Return a top-level recap (a few lines) and the report path. End with the exact
|
|
35
|
+
line:
|
|
36
|
+
|
|
37
|
+
## SUMMARY COMPLETE
|
|
38
|
+
|
|
39
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-test-author
|
|
3
|
+
description: Frozen test author for an STZ slice. Writes the sealed held-out suite (and a reference implementation that proves it is satisfiable) before the tournament; implementers never see either.
|
|
4
|
+
tools: Read, Write, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **test author** for an STZ slice. You run once, before the
|
|
9
|
+
tournament, in a frozen context. Your output is sealed: the implementers see the
|
|
10
|
+
interface contract but never your tests or your reference.
|
|
11
|
+
|
|
12
|
+
## Your task
|
|
13
|
+
|
|
14
|
+
Given the slice contract and its done-predicates, write a held-out test suite
|
|
15
|
+
into `.stz/30-tests/held-out/`. Aim for tests that a specimen cannot satisfy by
|
|
16
|
+
gaming:
|
|
17
|
+
|
|
18
|
+
- cover the obvious happy path AND the edge cases (empty input, boundaries,
|
|
19
|
+
malformed input, large input),
|
|
20
|
+
- prefer property-based generators where the language supports them (fast-check
|
|
21
|
+
for TS, Hypothesis for Python, proptest for Rust) so the exact inputs are not
|
|
22
|
+
knowable in advance,
|
|
23
|
+
- encode each machine-checkable done-predicate as at least one assertion,
|
|
24
|
+
- do not depend on any single specimen's internal structure; test the contract.
|
|
25
|
+
|
|
26
|
+
## Write tests that survive a CORRECT implementation (hard rules)
|
|
27
|
+
|
|
28
|
+
These rules are the harness's **guide** for semantic robustness — and they are
|
|
29
|
+
the *only* control for it. The downstream smoke gate is a mechanical sensor
|
|
30
|
+
(compile + satisfiable-against-the-reference); it cannot catch a fragile
|
|
31
|
+
invariant, because the reference is written by you and shares your blind spot.
|
|
32
|
+
So a test that fails against every correct specimen is a *test* bug that only
|
|
33
|
+
these rules prevent — and it surfaces mid-tournament where it is expensive. Hold
|
|
34
|
+
to these:
|
|
35
|
+
|
|
36
|
+
- **It must compile/parse.** Before returning, build the suite (against your
|
|
37
|
+
reference, below). A suite that does not compile is not done.
|
|
38
|
+
- **Never key entity identity on mutable state.** If a thing moves, changes
|
|
39
|
+
position, or is reordered, do NOT identify it by `(row, col)`, index, or any
|
|
40
|
+
field it is allowed to change. Identify by a stable id, or — better — assert
|
|
41
|
+
over *movement-invariant* aggregates (counts, totals, sums) rather than
|
|
42
|
+
per-element position diffs. (The canonical trap: keying an alien on its
|
|
43
|
+
`(row,col)` and then asserting "it didn't duplicate" — a legitimate formation
|
|
44
|
+
step relocates it and the assertion misfires against every correct specimen.)
|
|
45
|
+
- **Assert invariants, not incidental state.** Prefer "score only rises on a
|
|
46
|
+
kill, by a value in the formation's value set" over "the entity at (r,c)
|
|
47
|
+
vanished." Invariants survive correct variation; snapshots of incidental state
|
|
48
|
+
do not.
|
|
49
|
+
|
|
50
|
+
## Write tests that catch an INCORRECT implementation (adversarial coverage)
|
|
51
|
+
|
|
52
|
+
The rules above keep the suite from failing *correct* code. These keep it from
|
|
53
|
+
*passing* incorrect code — the symmetric guide. A suite that only checks valid
|
|
54
|
+
inputs on the happy path is *satisfiable* (the smoke gate goes green) yet
|
|
55
|
+
discriminates nothing: a spec-violating implementation scores 100% and ties with
|
|
56
|
+
a correct one. (Observed in dogfood: a sealed suite that asserted only happy-path
|
|
57
|
+
outputs scored an implementation that silently accepts malformed input and
|
|
58
|
+
mis-parses a documented step form at a full 1.000 — the gate is as blind to a
|
|
59
|
+
non-discriminating suite as it is to a fragile invariant.) Hold to these:
|
|
60
|
+
|
|
61
|
+
- **Assert contract-mandated REJECTION.** For every "throw / error / reject on
|
|
62
|
+
X" clause, and every input the contract declares invalid, malformed, or
|
|
63
|
+
out-of-range, write a negative case asserting the implementation actually
|
|
64
|
+
throws or errors. A suite with no negative cases cannot tell a validating
|
|
65
|
+
implementation from one that silently accepts garbage. Your **reference
|
|
66
|
+
implementation must satisfy these negative cases too** (it must really throw) —
|
|
67
|
+
or the suite fails its own smoke gate, which is the correct signal that the bar
|
|
68
|
+
rose for the reference as well.
|
|
69
|
+
- **Make each case DISCRIMINATING, not merely satisfiable.** Choose inputs where
|
|
70
|
+
a plausibly-wrong implementation yields a DIFFERENT result than the correct one.
|
|
71
|
+
A case whose expected output a degenerate implementation also produces proves
|
|
72
|
+
nothing. (Canonical trap: a `5/15` step evaluated from a reference time *before*
|
|
73
|
+
minute 5 passes even for an impl that treats `a/n` as the single value `a` (here `5`);
|
|
74
|
+
evaluate it from *after* minute 5 so only the correct expansion matches.) For
|
|
75
|
+
each contracted feature ask "what common wrong implementation would this input
|
|
76
|
+
fail to catch?" and add one that catches it.
|
|
77
|
+
- **Cover the negative space with a property-based generator, not just a few
|
|
78
|
+
hand-picked negatives.** A short hand-picked list tends to cover only the
|
|
79
|
+
obvious malformed forms almost any implementation already rejects; the leniency
|
|
80
|
+
that actually ships hides in the parser's soft spots. A generator that mutates
|
|
81
|
+
valid inputs into invalid ones and asserts each throws explores those soft spots
|
|
82
|
+
a fixed list misses — the same reason property-based tests beat example tests on
|
|
83
|
+
the positive space.
|
|
84
|
+
- **Cover every explicitly-contracted feature** with at least one discriminating
|
|
85
|
+
case plus its boundaries — including the awkward interactions the contract names
|
|
86
|
+
(field unions, rollovers, overflow/leap, range/step/list forms). Happy-path-only
|
|
87
|
+
coverage of a feature the contract names is an authoring gap.
|
|
88
|
+
|
|
89
|
+
**Stay within the contract.** Test only behaviour the contract actually
|
|
90
|
+
specifies. If the contract is silent on a convention, NOT testing it is correct —
|
|
91
|
+
do not invent requirements the implementers were never given. That produces the
|
|
92
|
+
mirror failure (failing correct code on an unstated rule), the same class the
|
|
93
|
+
invariant rules above guard against.
|
|
94
|
+
|
|
95
|
+
## Reference implementation (proves the suite is satisfiable)
|
|
96
|
+
|
|
97
|
+
Also write a **minimal, correct reference implementation** of the contract into
|
|
98
|
+
`.stz/30-tests/held-out/reference/`. It exists only so the orchestrator can run
|
|
99
|
+
the suite against it and confirm it is GREEN before sealing — a suite no correct
|
|
100
|
+
implementation can pass is the bug above. The reference is sealed with the suite
|
|
101
|
+
and is **never** visible to specimens (it is a complete solution — leaking it
|
|
102
|
+
would hand out the answer). Do not place it in any prototype/specimen path.
|
|
103
|
+
|
|
104
|
+
## Output
|
|
105
|
+
|
|
106
|
+
Write the test files and the reference, then return a SHORT message: the
|
|
107
|
+
directory you wrote to, the files you created, one line on what each covers, and
|
|
108
|
+
that the reference compiles and the suite is green against it. Do not reveal
|
|
109
|
+
specific test inputs in your return message. Do not spawn any subagents.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-test-planner
|
|
3
|
+
description: Defines the project's test strategy BEFORE implementation — coverage targets, mutation policy, property-vs-example mix, the eval harness, and how each done-predicate maps to a planned check.
|
|
4
|
+
tools: Read, Bash, Grep, Glob
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **test planner** for an STZ project. This runs before any slice is
|
|
9
|
+
implemented, on purpose: the test strategy is pre-committed so the tournament
|
|
10
|
+
cannot be gamed against tests written after the fact.
|
|
11
|
+
|
|
12
|
+
You are NOT the per-slice sealed-suite author (that is `stz-test-author`, which
|
|
13
|
+
runs once per slice, before its tournament). You write the project-wide strategy that author
|
|
14
|
+
then follows.
|
|
15
|
+
|
|
16
|
+
## Your task
|
|
17
|
+
|
|
18
|
+
Read `.stz/00-intent/` (especially the done-predicates), `.stz/10-research/`,
|
|
19
|
+
and `.stz/20-standards/`. Write `.stz/30-tests/strategy.md` covering:
|
|
20
|
+
|
|
21
|
+
- **Coverage target** (a number, with rationale) and **mutation policy** (what
|
|
22
|
+
survival rate is acceptable).
|
|
23
|
+
- **Property-based vs example-based** mix, and where each applies.
|
|
24
|
+
- **The eval harness**: how a slice's sealed suite runs against a specimen, what
|
|
25
|
+
fixtures look like, how metrics are produced.
|
|
26
|
+
- **Predicate map**: a table mapping each done-predicate from `00-intent` to the
|
|
27
|
+
kind of check that will enforce it. Every predicate must have a row.
|
|
28
|
+
|
|
29
|
+
## Output
|
|
30
|
+
|
|
31
|
+
Return a SHORT message: the strategy file path, the coverage and mutation
|
|
32
|
+
targets, and the predicate-map table (or its summary). End with the exact line:
|
|
33
|
+
|
|
34
|
+
## TEST PLAN COMPLETE
|
|
35
|
+
|
|
36
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: stz-validator
|
|
3
|
+
description: Ground-truth validation. Verifies each research claim against reality (run code, fetch the real source, read the actual files) rather than trusting model recall. Writes a per-claim verdict.
|
|
4
|
+
tools: Read, Bash, Grep, Glob, WebFetch
|
|
5
|
+
model: inherit
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
You are the **validator** for an STZ project. The researcher produced claims;
|
|
9
|
+
your job is to check them against reality, not against what a model remembers.
|
|
10
|
+
|
|
11
|
+
## Your task
|
|
12
|
+
|
|
13
|
+
Read `.stz/10-research/external/` and `.stz/10-research/internal/`. For each
|
|
14
|
+
claim, verify it the hard way:
|
|
15
|
+
|
|
16
|
+
- API or library behaviour: run a small snippet, check the installed version,
|
|
17
|
+
read the actual source or the real docs page (fetch it).
|
|
18
|
+
- Codebase claims: open the files and confirm the code says what the claim says.
|
|
19
|
+
- Performance claims: if cheap to check, measure; otherwise mark unverifiable
|
|
20
|
+
and say what spike would settle it.
|
|
21
|
+
|
|
22
|
+
Write `.stz/10-research/validation.md` with one row per claim and a verdict:
|
|
23
|
+
**confirmed**, **refuted**, or **unverifiable**, each with a one-line evidence
|
|
24
|
+
pointer (the command you ran, the file and line, the URL).
|
|
25
|
+
|
|
26
|
+
Do not paper over a refuted claim. A refuted claim that the project depends on
|
|
27
|
+
is the single most valuable thing you can surface.
|
|
28
|
+
|
|
29
|
+
## Output
|
|
30
|
+
|
|
31
|
+
Return a SHORT message: counts of confirmed / refuted / unverifiable, and the
|
|
32
|
+
list of refuted or unverifiable claims (these are what the user must see). End
|
|
33
|
+
with the exact line:
|
|
34
|
+
|
|
35
|
+
## VALIDATION COMPLETE
|
|
36
|
+
|
|
37
|
+
Do not spawn subagents.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# The bridge CLI directly
|
|
2
|
+
|
|
3
|
+
STZ's deterministic half is exposed as `stz bridge <subcommand>` — JSON in,
|
|
4
|
+
JSON out, over the `.stz/` tree. The `/stz:*` commands call it between subagent
|
|
5
|
+
spawns, but it is scriptable on its own. Each subcommand prints one JSON object
|
|
6
|
+
and writes its artifacts under `.stz/`.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
stz bridge version # {version, schemaVersion, packageName} — drift detection (F19)
|
|
10
|
+
stz bridge begin --root . --manifest .stz/40-slices/slice-01/manifest.json
|
|
11
|
+
stz bridge eval --root . --slice slice-01 --specimen a \
|
|
12
|
+
--sealed .stz/30-tests/held-out/<file> \
|
|
13
|
+
--impl .stz/40-slices/slice-01/prototypes/specimen-a/<file>
|
|
14
|
+
stz bridge gate --root . --slice slice-01
|
|
15
|
+
stz bridge escalate --root . --slice slice-01 # no-passers: advance retry→replan→halt FSM (F14), write refinement/failure report
|
|
16
|
+
stz bridge record-votes --root . --slice slice-01 --votes votes.json
|
|
17
|
+
stz bridge select --root . --slice slice-01
|
|
18
|
+
stz bridge finalize --root . --slice slice-01 --intent intent.json --asbuilt asbuilt.json
|
|
19
|
+
|
|
20
|
+
# project-level driver (multi-slice)
|
|
21
|
+
stz bridge project-set-config --root . --config run-config.json # persist run config (validated, clamped)
|
|
22
|
+
stz bridge project-config --root . # read it back (defaults if unset)
|
|
23
|
+
stz bridge project-dark-factory --root . --on # engage autonomous mode (--off to disengage)
|
|
24
|
+
stz bridge project-status --root . # DAG + phase status + progress totals + dashboard-ready slice rows + runConfig + darkFactory
|
|
25
|
+
|
|
26
|
+
# sealed held-out suite integrity (L1/F10) — freeze before the tournament
|
|
27
|
+
stz bridge seal --root . # sha256 the held-out suite into SEAL.json
|
|
28
|
+
stz bridge seal-verify --root . # re-hash vs SEAL.json; exit 1 on drift (gate before judging)
|
|
29
|
+
stz bridge seal-crosscheck --root . --sealed <suite> --reference-a <impl> --reference-b <impl>
|
|
30
|
+
# run the suite vs two independent references; exit 1 unless both pass
|
|
31
|
+
stz bridge seal-amend --root . --reason "<why>" # sanctioned post-freeze change: records from→to + reason
|
|
32
|
+
|
|
33
|
+
# cross-slice merge integrity — superseded sealed invariants
|
|
34
|
+
stz bridge merge-validate --root . --results results.json # adjudicate reported suite failures; exit 1 unless all sanctioned
|
|
35
|
+
stz bridge merge-compat-propose --root . --entry entry.json # merge agent proposes a supersession (always unapproved)
|
|
36
|
+
stz bridge merge-compat-approve --root . --id <id> --by "<who/why>" # approver blesses it (recorded)
|
|
37
|
+
stz bridge merge-compat-retire --root . --id <id> --amendment "<ref>" # retire once the superseded suite is seal-amended
|
|
38
|
+
stz bridge merge-compat-list --root . # read-only dump of the manifest
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
`merge-validate` adjudicates *reported* sealed-suite results (`{slice, passed,
|
|
42
|
+
failure}`) against an audited compat manifest — it does not run the suites (the
|
|
43
|
+
assembled crate may be Rust), so what is deterministic is the **rule application**
|
|
44
|
+
(signature match + superseding-passes + approved), the same trust split as `eval`
|
|
45
|
+
vs `record-eval`. A failing suite is sanctioned only as a signature-matched,
|
|
46
|
+
approved supersession whose replacement invariant also passes; `pendingApproval` /
|
|
47
|
+
`invalid` / `unsanctioned` all block. Compat entries are transitional debt retired
|
|
48
|
+
by a `seal-amend`. Full contract:
|
|
49
|
+
[`../../commands/stz-merge.md`](../../commands/stz-merge.md) and the cross-slice
|
|
50
|
+
section of [`sealed-suite.md`](./sealed-suite.md).
|
|
51
|
+
|
|
52
|
+
`escalate` is the deterministic owner of bounded cross-round failure handling
|
|
53
|
+
(F14). The `/stz:run` command calls it once after a gate that produced zero
|
|
54
|
+
passers; it advances the retry→replan→halt FSM over `state.json` (hard ceiling:
|
|
55
|
+
≤1 retry, ≤1 replan), persists the new counts, and writes the PDR `refinement.md`
|
|
56
|
+
the next round's specimens consume (on retry/replan) or a `failure-report.md` and
|
|
57
|
+
a `judgment: failed` phase (on halt). `gate` stays a pure read and never mutates
|
|
58
|
+
escalation, so the two can't double-advance; the FSM's ceiling makes even a stray
|
|
59
|
+
double-`escalate` fail-safe (it halts early, never loops). The sealed suite is
|
|
60
|
+
untouched across rounds — retry/replan re-enter the tournament with the same
|
|
61
|
+
frozen suite, `seal-verify` gating each round.
|
|
62
|
+
|
|
63
|
+
`project-dark-factory` is a load-modify-save toggle: it flips `darkFactory` in the
|
|
64
|
+
persisted run config without touching any other field (deliberately NOT routed
|
|
65
|
+
through `project-set-config`, whose normalize-over-defaults merge would reset
|
|
66
|
+
fan-out/models/strictness). It is the single source of truth for autonomous mode —
|
|
67
|
+
the `/stz:*` commands read the hoisted `darkFactory` flag from `project-status` at
|
|
68
|
+
each phase, so engaging it mid-run takes effect at the next phase. See
|
|
69
|
+
[`dark-factory.md`](./dark-factory.md) for the gate-skipping contract.
|
|
70
|
+
|
|
71
|
+
The sealed-suite commands back the anti-hacking freeze: `seal-crosscheck` (0.5.0)
|
|
72
|
+
runs the suite against a second, independently-authored reference before sealing,
|
|
73
|
+
so a blind spot the test-author shares with the suite surfaces as a divergence;
|
|
74
|
+
`seal` runs after the smoke gate is green and the cross-check is both-pass; `seal-verify` runs
|
|
75
|
+
immediately before the eval/gate so a frozen-suite edit can't slip in
|
|
76
|
+
mid-tournament; `seal-amend` is the only audited way to change a sealed file once
|
|
77
|
+
frozen. The guide-vs-sensor
|
|
78
|
+
contract behind it (what the smoke gate does and does NOT catch, where the
|
|
79
|
+
reference lives, how failures are classified) is in
|
|
80
|
+
[`sealed-suite.md`](./sealed-suite.md).
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Dark-factory mode: the autonomous-run contract
|
|
2
|
+
|
|
3
|
+
A *dark factory* runs lights-out: no human on the floor, product and a report at
|
|
4
|
+
the end. STZ's dark-factory mode (0.4.0) is the same idea for the slice pipeline —
|
|
5
|
+
elicitation hands off and the orchestrator drives research → ground-truth →
|
|
6
|
+
standards → testing-conventions → slice-disaggregation → every per-slice
|
|
7
|
+
tournament → summary with **no human in the loop**, surfacing only the final
|
|
8
|
+
completion report.
|
|
9
|
+
|
|
10
|
+
This is the literal intent in the project's executive summary ("software
|
|
11
|
+
engineering dark factories with auditable outputs"): autonomous, but every
|
|
12
|
+
decision still lands in the replayable `.stz/` audit tree, so a human can review
|
|
13
|
+
the whole run after the fact.
|
|
14
|
+
|
|
15
|
+
## The one gate that never closes
|
|
16
|
+
|
|
17
|
+
Dark-factory skips every *downstream* human gate, but **not** the F2 done-predicate
|
|
18
|
+
gate in `/stz:new`. Elicitation may not exit with zero machine-checkable
|
|
19
|
+
predicates, and acceptance criteria are never auto-invented — the predicates are
|
|
20
|
+
the contract the autonomous run executes against. So the dark-factory opt-in question is offered only
|
|
21
|
+
*after* the predicate gate is satisfied; by the time dark-factory drives anything,
|
|
22
|
+
the contract is already locked.
|
|
23
|
+
|
|
24
|
+
Gates that ARE skipped when `darkFactory` is on:
|
|
25
|
+
|
|
26
|
+
- `/stz:slice` "Approve as-is" — the proposed slice DAG is auto-approved.
|
|
27
|
+
- `/stz:run` step 8b winner-approval — the selected winner is auto-accepted. The
|
|
28
|
+
full ranking, GRPO advantages, and any disqualified specimens with their hack
|
|
29
|
+
findings still land in the audit tree; nothing is hidden, only un-prompted.
|
|
30
|
+
|
|
31
|
+
A halted slice does not stall the factory: it is reported and the rest of the DAG
|
|
32
|
+
continues. Every halt surfaces in the final `/stz:summary` completion report.
|
|
33
|
+
|
|
34
|
+
This is also how the autonomous run handles the decisions it must not make alone.
|
|
35
|
+
Two human-only gates can arise mid-run:
|
|
36
|
+
|
|
37
|
+
- A `seal-crosscheck` divergence (0.5.0) — a blind-spot signal that requires human
|
|
38
|
+
adjudication and must never auto-rewrite.
|
|
39
|
+
- A blocked `merge-validate` (0.5.2) — an `unsanctioned`/`invalid` merge failure,
|
|
40
|
+
or a compat entry still `pendingApproval` (the merge agent proposed it but no
|
|
41
|
+
human has approved). Auto-approving would defeat the gate.
|
|
42
|
+
|
|
43
|
+
In both cases, rather than act on an unresolved signal — or block forever waiting
|
|
44
|
+
for a human who isn't there — the slice is **halted** and the DAG continues; the
|
|
45
|
+
signal (already recorded in `30-tests/cross-reference.md` or
|
|
46
|
+
`90-audit/merge-validation.md`) is surfaced in the final summary for after-the-fact
|
|
47
|
+
review. The factory defers that decision; it does not guess.
|
|
48
|
+
|
|
49
|
+
## Where the flag lives, and why a dedicated toggle
|
|
50
|
+
|
|
51
|
+
`darkFactory` is a boolean on the persisted run config (`00-intent/run-config.json`).
|
|
52
|
+
It is set two ways:
|
|
53
|
+
|
|
54
|
+
1. **At the end of elicitation** — `/stz:new` asks once, after the predicate gate.
|
|
55
|
+
2. **At any point** — `stz bridge project-dark-factory --root . --on` (or `--off`).
|
|
56
|
+
|
|
57
|
+
The toggle is a deliberate **load-modify-save**: it reads the existing config,
|
|
58
|
+
flips the one field, and writes it back. It is NOT routed through
|
|
59
|
+
`project-set-config`, because that command runs `normalizeRunConfig(partial)`,
|
|
60
|
+
which merges the partial over the *defaults* — a mid-run `set-config
|
|
61
|
+
{darkFactory:true}` would silently reset fan-out, models, and strictness. The
|
|
62
|
+
dedicated command is the single source of truth; never hand-edit run-config.json.
|
|
63
|
+
|
|
64
|
+
`project-status` hoists the resolved value to a top-level `darkFactory` field (as
|
|
65
|
+
well as inside `runConfig`), so each command reads it once at the start of every
|
|
66
|
+
phase. Engaging it between phases therefore takes effect at the next phase with no
|
|
67
|
+
restart.
|
|
68
|
+
|
|
69
|
+
## What is and isn't tested
|
|
70
|
+
|
|
71
|
+
The deterministic plumbing — the config field, normalization/coercion, the
|
|
72
|
+
load-modify-save toggle (with a regression test proving it never resets sibling
|
|
73
|
+
fields), persistence, and the hoisted status surface — is covered end-to-end in
|
|
74
|
+
`test/project.test.ts`. The autonomous *orchestration loop itself* lives in command
|
|
75
|
+
markdown (`/stz:pipeline`, `/stz:run`, `/stz:slice`) and is driven by the agent,
|
|
76
|
+
so it is not unit-tested; the tests cover the flag plumbing those commands read,
|
|
77
|
+
not the agent loop.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Local development & testing
|
|
2
|
+
|
|
3
|
+
Running STZ's deterministic engine without Claude Code — for contributors,
|
|
4
|
+
CI, and quick smoke tests. The operator-facing install and usage live in the
|
|
5
|
+
[top-level README](../../README.md).
|
|
6
|
+
|
|
7
|
+
## As a library / local CLI only
|
|
8
|
+
|
|
9
|
+
If you only want the deterministic engine and the mock pipeline:
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
git clone https://github.com/dr-robert-li/slice-tournament-zoo
|
|
13
|
+
cd slice-tournament-zoo
|
|
14
|
+
npm install
|
|
15
|
+
npm test # 93 deterministic tests
|
|
16
|
+
npm run typecheck
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Mock run (testing only, no network)
|
|
20
|
+
|
|
21
|
+
A self-contained mock drives the whole pipeline with no API keys, network, or
|
|
22
|
+
subagents — handy as a fast smoke test of the deterministic spine. It is a
|
|
23
|
+
testing aid, not the production path. See [`src/mock/`](../../src/mock).
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
stz run <dir> # drive the demo slice end to end against the mock model
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
The standalone mock demo runs all eight phases inside a single slice for a
|
|
30
|
+
self-contained, no-network smoke test. The real in-session flow is the two-level
|
|
31
|
+
split described in the README's pipeline section.
|
|
32
|
+
|
|
33
|
+
## CI-style local check (no Claude Code)
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npm test && npm run typecheck && stz run /tmp/stz-smoke
|
|
37
|
+
```
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# Sealed held-out suite: the integrity contract
|
|
2
|
+
|
|
3
|
+
The held-out suite (L1/F10) is the grader specimens compete against, frozen
|
|
4
|
+
before the tournament so it cannot be tuned to favour one. Keeping it both
|
|
5
|
+
*correct* and *frozen* is split across two kinds of control, on purpose. The
|
|
6
|
+
distinction is the harness-engineering one between **guides** and **sensors**:
|
|
7
|
+
|
|
8
|
+
- a **guide** prevents a conceptual blind spot *up front* (authoring guidance);
|
|
9
|
+
- a **sensor** catches only what is *mechanically observable afterward*.
|
|
10
|
+
|
|
11
|
+
Conflating them is how the fragile-invariant bug shipped: a sensor was expected
|
|
12
|
+
to catch a class only a guide can prevent.
|
|
13
|
+
|
|
14
|
+
## The contract
|
|
15
|
+
|
|
16
|
+
**Prompt hardening (a GUIDE) owns semantic robustness.** The `stz-test-author`
|
|
17
|
+
agent prompt forbids identity on mutable state (the `(row,col)`-of-a-thing-that-
|
|
18
|
+
moves trap), requires movement-invariant predicates (counts, totals, sums over
|
|
19
|
+
per-element position snapshots), and requires each done-predicate to be encoded
|
|
20
|
+
as an invariant rather than a brittle snapshot diff. This is the *only* control
|
|
21
|
+
for the fragile-invariant class — see below for why.
|
|
22
|
+
|
|
23
|
+
The same guide owns the **symmetric** class: a suite that does not *fail correct
|
|
24
|
+
code* but does not *catch incorrect code* either — see "The permissive-suite
|
|
25
|
+
class" below.
|
|
26
|
+
|
|
27
|
+
**The smoke gate (a SENSOR) owns mechanical validity only.** Before sealing, the
|
|
28
|
+
orchestrator compiles the suite and runs it against the test-author's reference
|
|
29
|
+
implementation in a throwaway scratch dir. A green gate means exactly:
|
|
30
|
+
|
|
31
|
+
> compiles, and is satisfiable against the sealed reference.
|
|
32
|
+
|
|
33
|
+
It does **not** mean the suite is semantically robust. The reference is authored
|
|
34
|
+
by the same agent under the same assumptions, so if the author keys identity on
|
|
35
|
+
mutable position, the reference may move things the same wrong way, the suite
|
|
36
|
+
goes green, and the bug still ships. The sensor is blind to its author's blind
|
|
37
|
+
spot — which is precisely why the guide, not the gate, owns that class.
|
|
38
|
+
|
|
39
|
+
**The reference implementation stays sealed.** It is a *full, correct solution*
|
|
40
|
+
to the contract. It lives under `.stz/30-tests/held-out/reference/`, is sealed
|
|
41
|
+
with the suite, and is never placed on any specimen-visible path. The gate
|
|
42
|
+
materializes it only into a temporary scratch workspace and discards it. Leaking
|
|
43
|
+
it would hand specimens the answer — a worse hole than the one the gate closes.
|
|
44
|
+
|
|
45
|
+
## The four phases
|
|
46
|
+
|
|
47
|
+
1. **Author** — `stz-test-author` writes the sealed suite under the hardened
|
|
48
|
+
authoring guidance, plus the reference implementation.
|
|
49
|
+
2. **Gate** — the orchestrator copies suite + reference into a scratch dir and
|
|
50
|
+
runs compile-only first (`cargo test --no-run`, `tsc --noEmit`, …) then a
|
|
51
|
+
satisfiability run. Passing means "compiles + satisfiable", nothing more.
|
|
52
|
+
3. **Seal** — `stz bridge seal` records a sha256 of every held-out file into a
|
|
53
|
+
byte-stable, timestamp-free `SEAL.json`, then the suite is frozen. The
|
|
54
|
+
orchestrator runs `stz bridge seal-verify` immediately before the eval/gate;
|
|
55
|
+
it exits non-zero on any drift, so an edit between sealing and judging can't
|
|
56
|
+
slip through.
|
|
57
|
+
4. **Amend** — if a defect is found later, never patch the canonical sealed file
|
|
58
|
+
in place: `stz bridge seal-amend --reason "<why>"` records the per-file
|
|
59
|
+
from→to hashes + reason into the manifest and re-freezes. A silent edit then
|
|
60
|
+
fails `seal-verify`.
|
|
61
|
+
|
|
62
|
+
## The permissive-suite class: passing INCORRECT code (the symmetric guide)
|
|
63
|
+
|
|
64
|
+
The fragile-invariant class is a suite that **fails correct** code. Its mirror is
|
|
65
|
+
a suite that **passes incorrect** code: it asserts only valid, happy-path inputs,
|
|
66
|
+
so a spec-violating implementation scores 100% and ties with a correct one. The
|
|
67
|
+
suite is *satisfiable* — the smoke gate is green — but it *discriminates nothing*.
|
|
68
|
+
|
|
69
|
+
This is a **guide-class** failure for the same reason the fragile-invariant is:
|
|
70
|
+
the smoke gate only proves "compiles + satisfiable against the reference"; a
|
|
71
|
+
non-discriminating suite satisfies that vacuously. No mechanical sensor sees the
|
|
72
|
+
missing negative case. Only authoring guidance can.
|
|
73
|
+
|
|
74
|
+
The dogfood case that motivated the rule: a sealed `nextRun` (cron) suite asserted
|
|
75
|
+
only first-fire *times* and contained **no rejection cases**. An implementation
|
|
76
|
+
that silently accepted malformed expressions (returning a time instead of throwing,
|
|
77
|
+
which the contract mandated) and mis-parsed a documented step form scored a full
|
|
78
|
+
**1.000 — tying a correct one**, on both the sealed suite and an independent truth
|
|
79
|
+
oracle. Flat pass-rate selection could not separate them.
|
|
80
|
+
|
|
81
|
+
The `stz-test-author` guide now requires (symmetric with the invariant rules):
|
|
82
|
+
|
|
83
|
+
- **contract-mandated rejection cases** — every "throw/error/reject on X" clause
|
|
84
|
+
gets a negative assertion; the author's reference must satisfy them too;
|
|
85
|
+
- **discriminating inputs** — each case must be one a plausibly-wrong impl fails,
|
|
86
|
+
not one a degenerate impl also passes (the `5/15`-from-before-minute-5 trap);
|
|
87
|
+
- **a property-based generator over the negative space** — hand-picked negatives
|
|
88
|
+
reliably cover only the obvious malformed forms an implementation already
|
|
89
|
+
rejects; a generator that mutates valid inputs into invalid ones and asserts
|
|
90
|
+
each throws reaches the parser soft spots a fixed list misses (the dogfood
|
|
91
|
+
validation below: 3/3 blind authors added rejection cases, but their hand-picked
|
|
92
|
+
negatives missed the one leniency that actually discriminated the specimens);
|
|
93
|
+
- **coverage of every contracted feature**, not just the happy path —
|
|
94
|
+
- while **staying within the contract** (testing an unstated convention would
|
|
95
|
+
re-introduce the fragile-invariant class from the other side).
|
|
96
|
+
|
|
97
|
+
## Cross-family reference: an independent guide against shared blind spots
|
|
98
|
+
|
|
99
|
+
The guide above (the `stz-test-author` hard rules) is the *only* control for the
|
|
100
|
+
fragile-invariant class — but it has a structural limit: a single author cannot
|
|
101
|
+
guide themselves out of a blind spot they don't know they have. The smoke gate
|
|
102
|
+
can't help, because its reference shares that blind spot. So one failure class
|
|
103
|
+
remains uncovered: a wrong assumption baked into *both* the suite and its
|
|
104
|
+
reference.
|
|
105
|
+
|
|
106
|
+
The **cross-family reference** (0.5.0) closes it. A second reference is authored
|
|
107
|
+
**independently** — a different model family (or a human), seeing only the
|
|
108
|
+
contract and done-predicates, never the suite or the primary reference — by the
|
|
109
|
+
`stz-cross-reference` agent. It is a full, correct solution, lives under
|
|
110
|
+
`.stz/30-tests/held-out/reference-b/`, and is sealed with the suite (never
|
|
111
|
+
specimen-visible). Before sealing, `stz bridge seal-crosscheck` runs the suite
|
|
112
|
+
against both references:
|
|
113
|
+
|
|
114
|
+
- **both-pass** — two independent implementations satisfy the suite. A blind spot
|
|
115
|
+
shared by author and suite would have made one of them fail, so passing both is
|
|
116
|
+
positive evidence the suite isn't over-fit to one author's assumptions. Seal.
|
|
117
|
+
- **divergent** (exactly one passes) — the suite encodes an assumption one author
|
|
118
|
+
didn't share. The command exits non-zero to PAUSE the pipeline, exactly like
|
|
119
|
+
`seal-verify`.
|
|
120
|
+
- **both-fail** — the suite is unsatisfiable as written; that's a gate/sensor
|
|
121
|
+
failure (loop the stderr back to the author), not a cross-family signal.
|
|
122
|
+
|
|
123
|
+
**Divergence is a signal, not a verdict.** A B-fails-A-passes split is ambiguous
|
|
124
|
+
by construction: either the suite over-fits A (the blind spot you want to catch)
|
|
125
|
+
or reference B is simply wrong — and aggregate pass counts cannot distinguish
|
|
126
|
+
them. So the cross-check is itself a **guide-class control**: it surfaces the
|
|
127
|
+
divergence for *human adjudication* (strengthen the author guidance + `seal-amend`
|
|
128
|
+
the suite, or discard a buggy B), and it never triggers a sensor-style automatic
|
|
129
|
+
rewrite. It is the R2 "cross-family quorum" idea applied to the reference rather
|
|
130
|
+
than the judge.
|
|
131
|
+
|
|
132
|
+
## Cross-slice merge: invariants that a later slice supersedes
|
|
133
|
+
|
|
134
|
+
There is a third way a sealed suite can fail on *correct* code, distinct from a
|
|
135
|
+
fragile invariant: a suite that was right **in isolation** but is obsolete **under
|
|
136
|
+
composition**. When slice winners are assembled into one integrated crate, an
|
|
137
|
+
earlier slice's suite may assert an invariant a later slice legitimately
|
|
138
|
+
supersedes — the canonical case is slice-03's "aliens never respawn" against
|
|
139
|
+
slice-05's wave-clear. The integrated crate fails slice-03's suite, and that is
|
|
140
|
+
not a merge defect.
|
|
141
|
+
|
|
142
|
+
The failure mode to guard against is the **orchestrator hand-waving it** ("looks
|
|
143
|
+
like the expected interaction, moving on"). That is the same unaudited,
|
|
144
|
+
judgment-call hole the sealed suite exists to close — just relocated to merge
|
|
145
|
+
time. So STZ makes the call deterministic and audited (`stz bridge merge-validate`
|
|
146
|
+
+ a compat manifest), not a vibe. A superseded-invariant failure is sanctioned
|
|
147
|
+
only when:
|
|
148
|
+
|
|
149
|
+
1. a **signature-pinned** compat entry matches the exact panic substring (never
|
|
150
|
+
the test name alone — that would launder a real new bug in the same test);
|
|
151
|
+
2. the **superseding invariant also passes** on the assembled crate (you cannot
|
|
152
|
+
claim supersession when the replacement behaviour isn't even proven there);
|
|
153
|
+
3. the entry is **approved** — the merge agent may propose, but only an approver
|
|
154
|
+
blesses it, and the approval records who/why so a self-approval is an auditable
|
|
155
|
+
anomaly.
|
|
156
|
+
|
|
157
|
+
This is a **deferral layer on this contract, not a parallel one**: a compat entry
|
|
158
|
+
is transitional debt that points at a pending wave-aware `seal-amend`. The end
|
|
159
|
+
state is the amended (composed-invariant) suite passing outright, at which point
|
|
160
|
+
the entry is retired. Until then the manifest's append-only history is the
|
|
161
|
+
protection — consistent with N1 (auditability over prevention). Full mechanics:
|
|
162
|
+
[`../../commands/stz-merge.md`](../../commands/stz-merge.md).
|
|
163
|
+
|
|
164
|
+
One inherent caveat of substring matching: a genuinely new merge bug whose panic
|
|
165
|
+
*contains* the pinned substring AND whose superseding suite also passes would be
|
|
166
|
+
wrongly sanctioned. Mitigation is pinning the substring tightly to the assertion
|
|
167
|
+
message; the residual risk is the cost of the reported-results approach.
|
|
168
|
+
|
|
169
|
+
## Error handling follows the same split
|
|
170
|
+
|
|
171
|
+
- **Compile or unsatisfiable failure** → a **gate (sensor) failure**. Feed the
|
|
172
|
+
exact compiler/test stderr back into a rewrite loop for `stz-test-author`, then
|
|
173
|
+
re-gate. Do not hand-patch the suite.
|
|
174
|
+
- **Fragile invariant discovered later** (e.g. the sealed suite fails identically
|
|
175
|
+
across all *correct* specimens at eval) → an **authoring (guide) failure, not a
|
|
176
|
+
gate miss**. The gate was never capable of detecting it. Fix via an audited
|
|
177
|
+
`seal-amend`, and treat it as a signal to strengthen the author guidance — not
|
|
178
|
+
as a bug in the gate.
|
|
179
|
+
- **Permissive suite discovered later** (the sealed suite passes a specimen that
|
|
180
|
+
violates the contract — e.g. accepts input the contract says to reject, or ties
|
|
181
|
+
a spec-violating specimen with a correct one) → also an **authoring (guide)
|
|
182
|
+
failure, not a gate miss**: the gate only checks satisfiability, which a
|
|
183
|
+
non-discriminating suite meets vacuously. Fix via an audited `seal-amend` that
|
|
184
|
+
adds the missing rejection / discriminating cases, and strengthen the author
|
|
185
|
+
guidance. Do NOT fix it by copying cases you saw a specimen fail on a private
|
|
186
|
+
oracle — that is train-on-test and voids the held-out property.
|
|
187
|
+
|
|
188
|
+
The short version: **fix the contract, not just the prompt.** Prompt hardening is
|
|
189
|
+
the control for semantic fragility; the smoke gate is a narrow mechanical sensor;
|
|
190
|
+
the full-solution reference is isolated to scratch-only verification.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "slice-tournament-zoo",
|
|
3
|
-
"version": "0.6.0",
|
|
3
|
+
"version": "0.7.2",
|
|
4
4
|
"description": "STZ: a contract-bounded slice pipeline that implements each slice adversarially via an N-specimen tournament with frozen sealed tests, GRPO-style selection, layered anti-reward-hacking, and a replayable markdown audit trail.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"homepage": "https://github.com/dr-robert-li/slice-tournament-zoo#readme",
|
|
@@ -20,7 +20,9 @@
|
|
|
20
20
|
},
|
|
21
21
|
"files": [
|
|
22
22
|
"src",
|
|
23
|
-
"bin"
|
|
23
|
+
"bin",
|
|
24
|
+
"agents",
|
|
25
|
+
"docs/development"
|
|
24
26
|
],
|
|
25
27
|
"scripts": {
|
|
26
28
|
"stz": "tsx src/cli.ts",
|
package/src/README.md
CHANGED
|
@@ -14,6 +14,6 @@ the production path — see [`mock/`](https://github.com/dr-robert-li/slice-tour
|
|
|
14
14
|
## Further reading
|
|
15
15
|
|
|
16
16
|
- The requirement-to-test mapping is in [`docs/TESTPLAN.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/TESTPLAN.md).
|
|
17
|
-
- What is
|
|
17
|
+
- What is built, deferred, and planned next is in [`docs/ROADMAP.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/ROADMAP.md).
|
|
18
18
|
- Running the engine locally / in CI: [`docs/development/local-and-testing.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/local-and-testing.md).
|
|
19
19
|
- The deterministic bridge CLI: [`docs/development/bridge-cli.md`](https://github.com/dr-robert-li/slice-tournament-zoo/blob/main/docs/development/bridge-cli.md).
|
package/src/bridge.ts
CHANGED
|
@@ -33,6 +33,7 @@ import type {
|
|
|
33
33
|
ProjectPhase,
|
|
34
34
|
ProjectSliceEntry,
|
|
35
35
|
RunConfig,
|
|
36
|
+
SpecimenId,
|
|
36
37
|
} from "./types.js";
|
|
37
38
|
import { PROJECT_PHASES } from "./types.js";
|
|
38
39
|
import { scaffold, writeDoc, readDoc, stzPath } from "./taxonomy.js";
|
|
@@ -57,6 +58,7 @@ import {
|
|
|
57
58
|
} from "./project.js";
|
|
58
59
|
import { detectHacks } from "./hack-detector.js";
|
|
59
60
|
import { STZ_VERSION, SCHEMA_VERSION, PACKAGE_NAME } from "./version.js";
|
|
61
|
+
import { onNoPassers, type EscalationState } from "./escalation.js";
|
|
60
62
|
import { evalGate, select, pairings } from "./selection.js";
|
|
61
63
|
import { diffSpecs, renderSpecDiff, isFaithful, unmatchedIntentIds, mismatchedAsBuiltIds, type Spec } from "./specdiff.js";
|
|
62
64
|
import { seal, verifySeal, amendSeal, heldOutFiles } from "./seal.js";
|
|
@@ -247,10 +249,136 @@ function gate(args: Record<string, string>): void {
|
|
|
247
249
|
const { root, slice } = args as { root: string; slice: string };
|
|
248
250
|
const evals = loadEvals(root, slice);
|
|
249
251
|
const { passers, eliminated } = evalGate(evals);
|
|
250
|
-
// Emit the pairing schedule the command must drive with judge agents.
|
|
252
|
+
// Emit the pairing schedule the command must drive with judge agents. `gate`
|
|
253
|
+
// is a pure read — it never advances escalation. When `passers` is empty the
|
|
254
|
+
// command calls `escalate` (below), which owns the state transition; keeping
|
|
255
|
+
// them separate means a re-run of `gate` can't double-advance the FSM.
|
|
251
256
|
print({ passers, eliminated, pairings: pairings(passers) });
|
|
252
257
|
}
|
|
253
258
|
|
|
259
|
+
/** Build the pressure-log entries: every specimen that is not the winner is a
|
|
260
|
+
* negative exemplar (F9). `winner` is null for a no-passers round (all culled). Each entry's `reason` lists the hack-finding patterns when any exist, otherwise the gate `testPassRate` to two decimals; `diff` joins the specimen's files (from `readSpecimenFiles`) as `+++ <path>` sections; `critique` is left empty here. */
|
|
261
|
+
function culledFromEvals(
|
|
262
|
+
root: string,
|
|
263
|
+
slice: string,
|
|
264
|
+
evals: EvalResult[],
|
|
265
|
+
winner: SpecimenId | null,
|
|
266
|
+
): CulledSpecimen[] {
|
|
267
|
+
return evals
|
|
268
|
+
.filter((e) => e.specimen !== winner)
|
|
269
|
+
.map((e) => ({
|
|
270
|
+
specimen: e.specimen,
|
|
271
|
+
reason: e.hackFindings.length
|
|
272
|
+
? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
|
|
273
|
+
: `gate testPassRate=${e.testPassRate.toFixed(2)}`,
|
|
274
|
+
diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
|
|
275
|
+
.map(([p, c]) => `+++ ${p}\n${c}`)
|
|
276
|
+
.join("\n"),
|
|
277
|
+
critique: "",
|
|
278
|
+
hackFindings: e.hackFindings,
|
|
279
|
+
}));
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/**
 * `escalate` bridge command — bounded cross-round escalation (F14) for the
 * command-level `/stz:run` loop.
 *
 * Invoke exactly ONCE after a gate that produced zero passers. The question
 * "may we run another round?" is answered here, deterministically, rather than
 * by the LLM: the escalation FSM is stepped over the counts held in
 * `state.json`, the new counts are saved, and for a retry/replan the PDR
 * refinement context consumed by the next round's specimens is written out.
 * This is the same path the mock orchestrator drives internally.
 *
 * The sealed suite is left alone: retry/replan go back into the tournament
 * against the SAME frozen suite (the command re-runs `seal-verify` every
 * round). Because the FSM's hard ceiling (≤1 retry, ≤1 replan) is reused, an
 * accidental second call is fail-safe — it halts sooner, it never loops.
 *
 * @param args Parsed command flags; `root` and `slice` are read from it.
 * @returns Resolves after the documents and state are written and the outcome
 *          object has been handed to `print`.
 */
async function escalateCmd(args: Record<string, string>): Promise<void> {
  const { root, slice } = args as { root: string; slice: string };
  const evals = loadEvals(root, slice);
  let state = await loadState(root, slice);

  // Snapshot of the FSM inputs as they were before this call advances them.
  const cur: EscalationState = {
    stage: state.escalation,
    retryCount: state.retryCount,
    replanCount: state.replanCount,
  };
  // 1-based number of the round that just failed: rounds already spent + this one.
  const failedRound = cur.retryCount + cur.replanCount + 1;

  // Step the FSM and record its verdict on the persisted state object.
  const transition = onNoPassers(cur);
  const next = transition.next;
  const action = transition.action;
  state.escalation = next.stage;
  state.retryCount = next.retryCount;
  state.replanCount = next.replanCount;
  state = appendEvent(state, "judgment", `escalation-${action.type}`, action.note);

  // No winner this round, so every specimen is culled. The pressure log is
  // written unconditionally so the negative exemplars stay auditable whatever
  // the FSM decided (F9).
  const culled = culledFromEvals(root, slice, evals, null);
  const pressureSummary = `Pressure log ${slice}: round ${failedRound}, ${culled.length} culled (no passers).`;
  await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
    frontmatter: { summary: pressureSummary },
    body: renderPressureLog({ sliceId: slice, culled }),
  });

  if (action.type !== "halt") {
    // retry or replan → write the PDR refinement context (F9). Advantages are
    // group-relative over the eval rewards alone (no votes), the same
    // computation the mock uses (orchestrator select(evals, [])).
    const advantages = select(evals, []).judgment.advantages;
    const refinementRel = join("50-pressure", slice, "refinement.md");
    await writeDoc(root, refinementRel, {
      frontmatter: { summary: `PDR refinement for ${slice} after round ${failedRound} (${action.type}).` },
      body: refinementContext({ sliceId: slice, culled }, advantages),
    });
    if (action.type === "replan") {
      // Planning is re-entered: the command rewrites intent.json before re-spawning.
      state = setPhaseStatus(state, "planning", "running");
    }
    await saveState(root, state);
    print({
      action: action.type,
      note: action.note,
      round: failedRound,
      nextRound: failedRound + 1,
      escalation: state.escalation,
      retryCount: state.retryCount,
      replanCount: state.replanCount,
      refinementPath: stzPath(root, refinementRel),
    });
    return;
  }

  // halt → the escalation budget is spent. Build one bullet per specimen
  // describing why it failed the gate in the final round.
  const outcomeLines: string[] = [];
  for (const e of evals) {
    let why: string;
    if (e.hackFindings.length > 0) {
      why = `disqualified — hack: ${e.hackFindings.map((f) => f.pattern).join(", ")}`;
    } else {
      why = `gate fail — testPassRate=${e.testPassRate.toFixed(2)}, coverage=${e.coverage.toFixed(2)}, mutation=${e.mutationScore.toFixed(2)}`;
    }
    outcomeLines.push(`- specimen-${e.specimen}: ${why}`);
  }
  const report =
    `# Failure report — ${slice}\n\n` +
    `No specimen passed the sealed-suite gate after ${failedRound} round(s) (${next.retryCount} retry, ${next.replanCount} replan). The bounded-escalation budget (≤1 retry, ≤1 replan) is exhausted; halting per F14.\n\n` +
    `## Per-specimen gate outcomes (final round)\n` +
    `${outcomeLines.join("\n")}\n`;

  // The report is kept both on the state and as a standalone document.
  state.failureReport = report;
  state = setPhaseStatus(state, "judgment", "failed");
  const failureReportRel = join(sliceRel(slice), "failure-report.md");
  await writeDoc(root, failureReportRel, {
    frontmatter: { summary: `Halt: no passers after ${failedRound} round(s).` },
    body: report,
  });
  await saveState(root, state);
  print({
    action: "halt",
    note: action.note,
    round: failedRound,
    escalation: state.escalation,
    retryCount: state.retryCount,
    replanCount: state.replanCount,
    failureReportPath: stzPath(root, failureReportRel),
  });
}
|
|
381
|
+
|
|
254
382
|
function recordVotes(args: Record<string, string>): void {
|
|
255
383
|
const { root, slice } = args as { root: string; slice: string };
|
|
256
384
|
const votes = readJSON<PairwiseVote[]>(args.votes!);
|
|
@@ -293,19 +421,7 @@ async function finalize(args: Record<string, string>): Promise<void> {
|
|
|
293
421
|
: { ranking: [], winner: null, advantages: [], votes: [] };
|
|
294
422
|
|
|
295
423
|
// Pressure log: every non-winning specimen is a negative exemplar (F9).
|
|
296
|
-
const culled
|
|
297
|
-
.filter((e) => e.specimen !== judgment.winner)
|
|
298
|
-
.map((e) => ({
|
|
299
|
-
specimen: e.specimen,
|
|
300
|
-
reason: e.hackFindings.length
|
|
301
|
-
? `hack: ${e.hackFindings.map((f) => f.pattern).join(",")}`
|
|
302
|
-
: `gate testPassRate=${e.testPassRate.toFixed(2)}`,
|
|
303
|
-
diff: Object.entries(readSpecimenFiles(root, slice, e.specimen))
|
|
304
|
-
.map(([p, c]) => `+++ ${p}\n${c}`)
|
|
305
|
-
.join("\n"),
|
|
306
|
-
critique: "",
|
|
307
|
-
hackFindings: e.hackFindings,
|
|
308
|
-
}));
|
|
424
|
+
const culled = culledFromEvals(root, slice, evals, judgment.winner);
|
|
309
425
|
await writeDoc(root, join("50-pressure", slice, "pressure.md"), {
|
|
310
426
|
frontmatter: { summary: `Pressure log ${slice}: ${culled.length} culled.` },
|
|
311
427
|
body: renderPressureLog({ sliceId: slice, culled }),
|
|
@@ -932,6 +1048,7 @@ export async function runBridge(argv: string[]): Promise<void> {
|
|
|
932
1048
|
case "record-eval": recordEval(args); break;
|
|
933
1049
|
case "eval": evalCmd(args); break;
|
|
934
1050
|
case "gate": gate(args); break;
|
|
1051
|
+
case "escalate": await escalateCmd(args); break;
|
|
935
1052
|
case "record-votes": recordVotes(args); break;
|
|
936
1053
|
case "select": await selectCmd(args); break;
|
|
937
1054
|
case "finalize": await finalize(args); break;
|
package/src/mock/orchestrator.ts
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
* The model layer is injected (ModelLayer), so this runs identically against
|
|
9
9
|
* the deterministic mock and a future live Claude Code / Codex implementation.
|
|
10
10
|
*
|
|
11
|
-
* STUBBED vs the full design (logged via the `log` sink, surfaced in
|
|
11
|
+
* STUBBED vs the full design (logged via the `log` sink, surfaced in ROADMAP):
|
|
12
12
|
* - git worktrees per specimen → prototypes/specimen-X/ directories instead.
|
|
13
13
|
* - per-worktree ephemeral observability stacks → not spun up.
|
|
14
14
|
* - live Python eval drivers / mutation / PBT → mock EvalRunner.
|