slice-tournament-zoo 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Hybrid selection (F7): eval-gate then judge ranking.
3
+ *
4
+ * Stage 1 (gate): a specimen must pass the sealed held-out suite AND carry no
5
+ * hack findings (F10/L3). Failures are eliminated.
6
+ * Stage 2 (rank): pairwise comparisons across passers, V=8 votes per pair
7
+ * (RTV default). Ranking is by win-count (the plain,
8
+ * deterministic aggregation — deliberately not clever).
9
+ *
10
+ * GRPO group-relative advantage (F8) is computed over a scalar reward derived
11
+ * from each passer's eval signal + pairwise win share, and is reported
12
+ * alongside the ranking so the winner pick and the pressure-log weighting share
13
+ * one number.
14
+ */
15
+ import type {
16
+ Advantage,
17
+ EvalResult,
18
+ Judgment,
19
+ PairwiseVote,
20
+ SpecimenId,
21
+ } from "./types.js";
22
+ import { groupRelativeAdvantage } from "./grpo.js";
23
+
24
/**
 * Stage 1 gate: partition eval results into passers and eliminated specimens.
 *
 * Hack findings are checked first: a result with any finding is eliminated
 * with a reason listing the matched patterns, even if it passed the gate.
 * Otherwise a result whose `passedGate` is false is eliminated with its test
 * pass rate (two decimals). Everything else passes.
 *
 * @param results Eval results; processed in input order.
 * @returns Passing specimen ids and eliminated specimens (with reason), both
 *   in input order.
 */
export function evalGate(results: EvalResult[]): {
  passers: SpecimenId[];
  eliminated: { specimen: SpecimenId; reason: string }[];
} {
  const passers: SpecimenId[] = [];
  const eliminated: { specimen: SpecimenId; reason: string }[] = [];

  // Returns the elimination reason, or null when the result clears the gate.
  const rejectionReason = (result: EvalResult): string | null => {
    if (result.hackFindings.length > 0) {
      const patterns = result.hackFindings.map((finding) => finding.pattern);
      return `hack-pattern: ${patterns.join(", ")}`;
    }
    if (!result.passedGate) {
      return `gate-fail: testPassRate=${result.testPassRate.toFixed(2)}`;
    }
    return null;
  };

  results.forEach((result) => {
    const reason = rejectionReason(result);
    if (reason === null) {
      passers.push(result.specimen);
    } else {
      eliminated.push({ specimen: result.specimen, reason });
    }
  });

  return { passers, eliminated };
}
48
+
49
/**
 * Count pairwise wins per specimen.
 *
 * Both contestants of every vote are registered (at 0 wins if not yet seen)
 * before the winner's count is incremented, so a specimen that never wins
 * still appears in the result.
 *
 * @param votes Pairwise votes, each naming contestants `a`, `b` and a `winner`.
 * @returns Map from specimen id to number of votes won.
 */
export function tallyVotes(votes: PairwiseVote[]): Map<SpecimenId, number> {
  const tally = new Map<SpecimenId, number>();
  votes.forEach(({ a, b, winner }) => {
    for (const contestant of [a, b]) {
      if (!tally.has(contestant)) tally.set(contestant, 0);
    }
    const previous = tally.get(winner) ?? 0;
    tally.set(winner, previous + 1);
  });
  return tally;
}
60
+
61
/**
 * Stage 2: rank passers by pairwise win-count (descending). Ties are broken by
 * the specimen's scalar eval reward (descending), then lexicographically by id
 * so the ranking is fully deterministic (N6).
 *
 * A NaN reward is ranked below every real reward. Subtracting rewards directly
 * would yield NaN for such specimens (and for two equal infinities), which
 * `Array.prototype.sort` treats as "equal" — silently skipping the id
 * tie-break and making the comparator inconsistent.
 *
 * @param passers Specimens that cleared the gate; not mutated.
 * @param votes Pairwise votes used to compute win counts.
 * @param rewardOf Scalar reward lookup used as the first tie-break.
 * @returns A new array of the passers, best first.
 */
export function rankByVotes(
  passers: SpecimenId[],
  votes: PairwiseVote[],
  rewardOf: (s: SpecimenId) => number,
): SpecimenId[] {
  const wins = tallyVotes(votes);

  // Map NaN to -Infinity so rewards form a total order.
  const orderedReward = (s: SpecimenId): number => {
    const reward = rewardOf(s);
    return Number.isNaN(reward) ? Number.NEGATIVE_INFINITY : reward;
  };

  return [...passers].sort((a, b) => {
    const wd = (wins.get(b) ?? 0) - (wins.get(a) ?? 0);
    if (wd !== 0) return wd;
    const ra = orderedReward(a);
    const rb = orderedReward(b);
    // Compare by sign rather than subtraction: Infinity - Infinity is NaN.
    if (ra !== rb) return rb > ra ? 1 : -1;
    return a < b ? -1 : a > b ? 1 : 0;
  });
}
80
+
81
/**
 * Scalar reward for one eval result: a weighted blend of test pass rate (0.5),
 * coverage (0.25) and mutation kill rate (0.25), where kill rate is computed
 * as `1 − mutationScore`. Feeds GRPO and ranking tie-breaks.
 *
 * No clamping is applied: the result lies in [0,1] only when all three inputs
 * do.
 *
 * @param r Eval result supplying `testPassRate`, `coverage`, `mutationScore`.
 * @returns The blended reward.
 */
export function evalReward(r: EvalResult): number {
  const { testPassRate, coverage, mutationScore } = r;
  const killRate = 1 - mutationScore;
  return 0.5 * testPassRate + 0.25 * coverage + 0.25 * killRate;
}
89
+
90
/**
 * Full two-stage selection producing a Judgment (F7 + F8).
 *
 * Runs the eval gate, ranks the passers by votes (reward as tie-break), and
 * computes group-relative advantages over ALL results — including
 * gate-eliminated specimens — so the pressure log can weight which losers'
 * diffs are most informative (F9).
 *
 * @param results Eval results for the whole specimen group.
 * @param votes Pairwise votes cast by the judge.
 * @returns The judgment (ranking, winner or null, advantages, votes) and the
 *   list of eliminated specimens with reasons.
 */
export function select(
  results: EvalResult[],
  votes: PairwiseVote[],
): { judgment: Judgment; eliminated: { specimen: SpecimenId; reason: string }[] } {
  const gate = evalGate(results);

  // Reward lookup; specimens without a recorded reward score 0.
  const rewardByName = new Map<SpecimenId, number>();
  for (const result of results) {
    rewardByName.set(result.specimen, evalReward(result));
  }
  const rewardOf = (s: SpecimenId) => rewardByName.get(s) ?? 0;

  const ranking = rankByVotes(gate.passers, votes, rewardOf);

  // Advantage spans the whole group, not just the passers.
  const rewarded = results.map(({ specimen }) => ({
    specimen,
    reward: rewardOf(specimen),
  }));
  const advantages: Advantage[] = groupRelativeAdvantage(rewarded);

  const judgment: Judgment = {
    ranking,
    winner: ranking[0] ?? null,
    advantages,
    votes,
  };
  return { judgment, eliminated: gate.eliminated };
}
115
+
116
/**
 * Build the full round-robin schedule for a set of passers: every unordered
 * pair exactly once, each to be voted on V times by the judge. Order is
 * deterministic — pairs are emitted by ascending first index, then ascending
 * second index (i < j).
 *
 * @param passers Specimens to pair up.
 * @returns All `[passers[i], passers[j]]` pairs with i < j.
 */
export function pairings(passers: SpecimenId[]): [SpecimenId, SpecimenId][] {
  return passers.flatMap((first, i) =>
    passers
      .slice(i + 1)
      .map((second): [SpecimenId, SpecimenId] => [first, second]),
  );
}
@@ -0,0 +1,141 @@
1
+ /**
2
+ * Intent-spec / as-built-spec diff (F13) — the canonical audit artifact.
3
+ *
4
+ * The planner produces an upfront intent spec; the documenter produces an
5
+ * as-built spec from the winning merged code + traces. Their diff is committed
6
+ * as `slice-NN/spec-diff.md`.
7
+ *
8
+ * Claims are matched by a stable KEY, not by wording. A claim's key is its
9
+ * explicit `id` when present, else its normalized text. This makes the legacy
10
+ * string-claim path a special case (key derived from text → content matching,
11
+ * case/whitespace-insensitive) while the id-keyed path lets the documenter
12
+ * reword a claim freely and still match the intent claim it adjudicates. An
13
+ * id-keyed as-built claim carries a `satisfied` verdict; `satisfied: false`
14
+ * means the intent claim was genuinely NOT delivered (it lands in `missing`,
15
+ * never silently in `kept`).
16
+ */
17
+
18
/**
 * One claim in a spec.
 *
 * Either a bare string (legacy form, matched by its normalized text) or an
 * object form whose fields are all optional:
 * - `id`: explicit match key; used instead of the text when non-blank.
 * - `text`: display text for the claim.
 * - `evidence`: shown as display text when `text` is absent.
 * - `satisfied`: verdict on an as-built claim; only an explicit `false` marks
 *   the matching intent claim as not delivered.
 */
export type Claim =
  | string
  | {
      id?: string;
      text?: string;
      evidence?: string;
      satisfied?: boolean;
    };
22
+
23
/** A spec (intent or as-built): an ordered list of claims. */
export interface Spec {
  /** The claims making up this spec, in document order. */
  claims: Array<Claim>;
}
26
+
27
/** Result of comparing an intent spec against an as-built spec. */
export interface SpecDiff {
  /** Intent claims with no satisfied as-built match: promised but not delivered (or not documented). */
  missing: string[];
  /** As-built claims matching no intent claim: delivered beyond the plan (scope creep / extras). */
  added: string[];
  /** Intent claims with a satisfied as-built match: delivered as planned. */
  kept: string[];
}
35
+
36
/**
 * Normalize claim text for matching: trim, lower-case, and collapse every run
 * of whitespace to a single space.
 */
function norm(s: string): string {
  const trimmed = s.trim();
  const lowered = trimmed.toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
39
+
40
/**
 * Display text for a claim — what shows in the spec-diff markdown.
 *
 * A bare string is returned as-is. For an object claim the first field that is
 * not null/undefined among `text`, `evidence`, `id` wins (an empty string
 * counts as present); with none set, a fixed placeholder is returned.
 */
function claimText(c: Claim): string {
  if (typeof c === "string") return c;
  for (const candidate of [c.text, c.evidence, c.id]) {
    if (candidate != null) return candidate;
  }
  return "(unspecified claim)";
}
45
+
46
/**
 * Stable match key for a claim.
 *
 * An object claim with a non-blank string `id` is keyed `id:<trimmed id>`.
 * Every other claim is keyed `t:<normalized display text>`, so the two key
 * spaces cannot collide.
 */
function claimKey(c: Claim): string {
  const explicitId = typeof c === "string" ? undefined : c.id;
  if (typeof explicitId === "string") {
    const trimmed = explicitId.trim();
    if (trimmed !== "") return `id:${trimmed}`;
  }
  return `t:${norm(claimText(c))}`;
}
53
+
54
/**
 * Whether an as-built claim counts as delivering its intent claim.
 *
 * Bare-string claims always count. Object claims count unless `satisfied` is
 * explicitly `false`; an absent verdict is treated as satisfied.
 */
function isSatisfied(c: Claim): boolean {
  if (typeof c === "string") return true;
  const explicitlyUnsatisfied = c.satisfied === false;
  return !explicitlyUnsatisfied;
}
58
+
59
/**
 * Diff an intent spec against an as-built spec, matching claims by key
 * (explicit id when present, otherwise normalized text).
 *
 * - `kept`: intent claims whose key has a satisfied as-built claim.
 * - `missing`: intent claims with no as-built claim at their key, or whose
 *   as-built claim says `satisfied: false`.
 * - `added`: as-built claims whose key matches no intent claim.
 *
 * All three lists hold display text. When several as-built claims share a
 * key, the last one decides the verdict.
 *
 * @param intent The planner's upfront spec.
 * @param asBuilt The documenter's spec of what was actually built.
 * @returns The three-way diff, each list in its source spec's order.
 */
export function diffSpecs(intent: Spec, asBuilt: Spec): SpecDiff {
  const builtByKey = new Map<string, Claim>();
  for (const c of asBuilt.claims) builtByKey.set(claimKey(c), c);
  const intendedKeys = new Set(intent.claims.map(claimKey));

  const missing: string[] = [];
  const kept: string[] = [];
  for (const c of intent.claims) {
    const b = builtByKey.get(claimKey(c));
    // Compare against undefined, not truthiness: a matched bare-string claim
    // can be "" (falsy) and must still count as present.
    if (b !== undefined && isSatisfied(b)) kept.push(claimText(c));
    else missing.push(claimText(c));
  }
  // An as-built claim whose key matches an intent claim is never "added" — even
  // if satisfied:false (then it's a genuine miss counted above), so the same
  // claim is never double-counted as both missing and added.
  const added = asBuilt.claims
    .filter((c) => !intendedKeys.has(claimKey(c)))
    .map(claimText);

  return { missing, added, kept };
}
80
+
81
/**
 * Render a spec diff as the markdown body of spec-diff.md (F13).
 *
 * Emits a title, a one-line description, then three sections (kept, missing,
 * added). Each section heading carries its item count; items are bullet
 * lines, or `_none_` when the list is empty. The output ends with a newline.
 *
 * @param sliceId Slice identifier shown in the title.
 * @param diff The diff to render.
 * @returns The markdown document.
 */
export function renderSpecDiff(sliceId: string, diff: SpecDiff): string {
  const section = (title: string, items: string[]): string => {
    const heading = `## ${title} (${items.length})\n`;
    const body = items.length
      ? items.map((item) => `- ${item}`).join("\n")
      : "_none_";
    return heading + body;
  };

  const lines: string[] = [];
  lines.push(`# Spec diff — ${sliceId}`, "");
  lines.push("Canonical audit record: intent spec vs. as-built spec.", "");
  lines.push(section("✅ Delivered as planned", diff.kept), "");
  lines.push(section("⚠️ Planned but missing", diff.missing), "");
  lines.push(section("➕ Built beyond plan", diff.added), "");
  return lines.join("\n");
}
99
+
100
/**
 * A slice is faithfully built when nothing planned is missing. Extras in
 * `added` do not affect the verdict.
 */
export function isFaithful(diff: SpecDiff): boolean {
  const { missing } = diff;
  return missing.length === 0;
}
104
+
105
/**
 * Ids of intent claims that the as-built spec did not cleanly account for —
 * used by `finalize` to warn when the documenter mis-keyed a verdict (a
 * mis-keyed id surfaces here as a false miss).
 *
 * Only object claims with a truthy `id` are considered; bare strings and
 * id-less claims are skipped. An id is reported when no as-built claim sits at
 * its key, or the one that does is `satisfied: false`.
 *
 * @param intent The planner's upfront spec.
 * @param asBuilt The documenter's as-built spec.
 * @returns Unaccounted intent ids in intent order; empty when every id-keyed
 *   intent claim was adjudicated as satisfied.
 */
export function unmatchedIntentIds(intent: Spec, asBuilt: Spec): string[] {
  // Later as-built claims overwrite earlier ones sharing a key.
  const builtByKey = new Map<string, Claim>(
    asBuilt.claims.map((claim): [string, Claim] => [claimKey(claim), claim]),
  );

  const unmatched: string[] = [];
  intent.claims.forEach((claim) => {
    if (typeof claim === "string" || !claim.id) return;
    const built = builtByKey.get(claimKey(claim));
    if (built && isSatisfied(built)) return;
    unmatched.push(claim.id);
  });
  return unmatched;
}
123
+
124
/**
 * Ids of as-built claims that look mis-keyed: the claim explicitly asserts
 * `satisfied: true` yet its key matches no intent claim.
 *
 * Ids the documenter coined for behaviour beyond intent are expected and are
 * excluded by convention: an absent-from-intent id counts as an extra unless
 * the claim explicitly sets `satisfied: true`. Bare strings and claims
 * without a truthy `id` are ignored.
 *
 * @param intent The planner's upfront spec.
 * @param asBuilt The documenter's as-built spec.
 * @returns Suspect as-built ids, in as-built order.
 */
export function mismatchedAsBuiltIds(intent: Spec, asBuilt: Spec): string[] {
  const intendedKeys = new Set(intent.claims.map(claimKey));
  return asBuilt.claims.flatMap((claim) => {
    if (typeof claim === "string" || !claim.id) return [];
    // A claim that lines up with an intent claim is correctly keyed.
    if (intendedKeys.has(claimKey(claim))) return [];
    // Asserts it satisfied something, but matches no intent id.
    return claim.satisfied === true ? [claim.id] : [];
  });
}
package/src/state.ts ADDED
@@ -0,0 +1,95 @@
1
+ /**
2
+ * Per-slice state store (F16, N1). git is the artifact store; `state.json`
3
+ * tracks current phase, active specimens, escalation, budget, and an
4
+ * append-only event sequence. Crash recovery resumes from `state.json` + the
5
+ * last commit on the slice branch.
6
+ *
7
+ * Lives at `.stz/40-slices/<sliceId>/state.json`. (The §3 taxonomy lists
8
+ * state.json under 90-audit; we keep a per-slice copy beside the slice so a
9
+ * slice is self-contained, matching F16 "state.json per slice".)
10
+ */
11
import { existsSync } from "node:fs";
import { mkdir, readFile, rename, rm, writeFile } from "node:fs/promises";
import { dirname, join } from "node:path";
import { allocateBudget } from "./budget.js";
import { STZ_DIR } from "./taxonomy.js";
import { PHASES, type Phase, type PhaseStatus, type SliceState } from "./types.js";
17
+
18
/**
 * Path of a slice's state file: `<root>/<STZ_DIR>/40-slices/<sliceId>/state.json`.
 *
 * @param root Project root directory.
 * @param sliceId Slice identifier, used as the directory name.
 */
export function statePath(root: string, sliceId: string): string {
  const sliceDir = join(root, STZ_DIR, "40-slices", sliceId);
  return join(sliceDir, "state.json");
}
21
+
22
/**
 * Build the initial state for a new slice: every phase pending, the first
 * phase current, normal escalation, zeroed counters, no events, and a budget
 * allocated from the given complexity and remaining pool.
 *
 * @param sliceId Slice identifier.
 * @param complexity Complexity passed to budget allocation (default 1).
 * @param poolRemaining Remaining pool passed to budget allocation (default 5,000,000).
 * @returns A fresh `SliceState` at schema version 1.
 */
export function freshState(sliceId: string, complexity = 1, poolRemaining = 5_000_000): SliceState {
  const phaseStatus = {} as Record<Phase, PhaseStatus>;
  for (const phase of PHASES) {
    phaseStatus[phase] = "pending";
  }

  const budget = allocateBudget(complexity, poolRemaining);
  return {
    schemaVersion: 1,
    sliceId,
    currentPhase: PHASES[0],
    phaseStatus,
    escalation: "normal",
    retryCount: 0,
    replanCount: 0,
    activeSpecimens: [],
    budget,
    events: [],
    callCount: 0,
    failureReport: null,
  };
}
41
+
42
/**
 * Append a structured event to the state's event log (N1 replay spine).
 *
 * The event's `seq` is the log length before the append, so sequence numbers
 * start at 0 and increase by one. Mutates `state` in place.
 *
 * @param state Slice state to mutate.
 * @param phase Phase the event belongs to, or "lifecycle".
 * @param kind Event kind.
 * @param detail Event detail text.
 * @returns The same `state` object, for chaining.
 */
export function appendEvent(
  state: SliceState,
  phase: Phase | "lifecycle",
  kind: string,
  detail: string,
): SliceState {
  const seq = state.events.length;
  state.events.push({ seq, phase, kind, detail });
  return state;
}
52
+
53
/**
 * Record a phase's new status and log the transition as an event.
 *
 * Setting a phase to "running" also makes it the current phase. Mutates
 * `state` in place.
 *
 * @param state Slice state to mutate.
 * @param phase Phase whose status changes.
 * @param status New status.
 * @returns The same `state` object, after the transition event is appended.
 */
export function setPhaseStatus(
  state: SliceState,
  phase: Phase,
  status: PhaseStatus,
): SliceState {
  state.phaseStatus[phase] = status;
  if (status === "running") {
    state.currentPhase = phase;
  }
  const kind = `phase-${status}`;
  const detail = `${phase} → ${status}`;
  return appendEvent(state, phase, kind, detail);
}
62
+
63
/**
 * Persist a slice's state as pretty-printed JSON at its state path, creating
 * parent directories as needed.
 *
 * The write is atomic with respect to crashes: the JSON goes to a temporary
 * sibling file which is then renamed over the target. state.json is what
 * crash recovery resumes from, so an interrupted save must never leave it
 * truncated — readers see either the old file or the complete new one.
 *
 * @param root Project root directory.
 * @param state State to persist; its `sliceId` selects the file.
 * @throws Any filesystem error from mkdir/writeFile/rename; the temporary
 *   file is removed (best effort) before rethrowing.
 */
export async function saveState(root: string, state: SliceState): Promise<void> {
  const target = statePath(root, state.sliceId);
  await mkdir(dirname(target), { recursive: true });

  // Same directory as the target so the rename stays on one filesystem; the
  // pid suffix keeps concurrent processes from sharing a temp file.
  const tmp = `${target}.${process.pid}.tmp`;
  try {
    await writeFile(tmp, JSON.stringify(state, null, 2) + "\n", "utf8");
    await rename(tmp, target);
  } catch (err: unknown) {
    // Best-effort cleanup; the original failure is what the caller must see.
    await rm(tmp, { force: true }).catch(() => undefined);
    throw err;
  }
}
68
+
69
/**
 * Read and parse a slice's state file.
 *
 * The parsed JSON is cast to `SliceState` without schema validation.
 *
 * @param root Project root directory.
 * @param sliceId Slice identifier.
 * @returns The parsed state.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
export async function loadState(root: string, sliceId: string): Promise<SliceState> {
  const file = statePath(root, sliceId);
  const raw = await readFile(file, "utf8");
  const parsed: unknown = JSON.parse(raw);
  return parsed as SliceState;
}
73
+
74
/** Whether a state file exists on disk for the given slice (synchronous check). */
export function stateExists(root: string, sliceId: string): boolean {
  const file = statePath(root, sliceId);
  return existsSync(file);
}
77
+
78
/**
 * Crash recovery (F16): decide which phase to resume from.
 *
 * A halted slice never resumes. Otherwise the first phase (in PHASES order)
 * left "running" was interrupted and is re-entered; failing that, the first
 * phase that is "pending" or "failed" is next.
 *
 * @param state Slice state loaded from disk.
 * @returns The phase to resume, or null when the slice is halted or no phase
 *   is running, pending or failed.
 */
export function resumePhase(state: SliceState): Phase | null {
  if (state.escalation === "halted") return null;

  // An interrupted phase takes priority over anything not yet started.
  for (const phase of PHASES) {
    if (state.phaseStatus[phase] === "running") return phase;
  }
  for (const phase of PHASES) {
    const status = state.phaseStatus[phase];
    if (status === "pending" || status === "failed") return phase;
  }
  return null;
}
92
+
93
/** A slice is complete when every phase in PHASES has status "done". */
export function isComplete(state: SliceState): boolean {
  const hasUnfinished = PHASES.some((phase) => state.phaseStatus[phase] !== "done");
  return !hasUnfinished;
}
@@ -0,0 +1,161 @@
1
+ /**
2
+ * The `.stz/` markdown taxonomy (§3 Data & Vector Store) — primary data store.
3
+ *
4
+ * Tiered tree:
5
+ * 00-intent/ 10-research/ 20-standards/ 30-tests/
6
+ * 40-slices/ 50-pressure/ 90-audit/
7
+ *
8
+ * Every file carries YAML frontmatter with a ~200-token `summary` field for
9
+ * progressive disclosure (N2): phase agents load summaries by default and fetch
10
+ * full bodies only on named-anchor reference.
11
+ *
12
+ * Dependency-light by design (N10 "minimal toolchain"): a tiny hand-rolled
13
+ * frontmatter (de)serializer rather than a YAML lib. The supported subset is
14
+ * scalars + string arrays, which is all the schema uses.
15
+ */
16
+ import { mkdir, writeFile, readFile } from "node:fs/promises";
17
+ import { existsSync } from "node:fs";
18
+ import { join, dirname } from "node:path";
19
+
20
/** Name of the taxonomy root directory, created directly under the project root. */
export const STZ_DIR = ".stz";
21
+
22
/**
 * Every directory of the `.stz/` tree, as paths relative to the taxonomy
 * root. Parents are listed before their children.
 */
export const TIERS = [
  // intent
  "00-intent",
  // research
  "10-research",
  "10-research/external",
  "10-research/internal",
  "10-research/spikes",
  // standards
  "20-standards",
  "20-standards/architecture-decisions",
  // tests
  "30-tests",
  "30-tests/held-out",
  // slices and pressure log
  "40-slices",
  "50-pressure",
  // audit
  "90-audit",
  "90-audit/calls",
] as const;
37
+
38
/**
 * Frontmatter of a taxonomy document: a required `summary` plus arbitrary
 * additional keys.
 */
export interface Frontmatter {
  /** Short summary of the document, loaded in place of the full body. */
  summary: string;
  /** Any further metadata keys. */
  [key: string]: unknown;
}
42
+
43
/** A parsed markdown document: its frontmatter and the markdown body after it. */
export interface MarkdownDoc {
  /** Metadata from the leading `---` block. */
  frontmatter: Frontmatter;
  /** Markdown content following the frontmatter. */
  body: string;
}
47
+
48
+ // ── frontmatter (de)serialization ─────────────────────────────────────────
49
+
50
/**
 * Serialize a frontmatter value into the text that follows `key:`.
 *
 * - empty array → `[]` (no leading space)
 * - non-empty array → a newline followed by one ` - item` line per element
 * - anything else → a space followed by the scalar form
 */
function serializeValue(v: unknown): string {
  if (!Array.isArray(v)) {
    return ` ${scalar(v)}`;
  }
  if (v.length === 0) {
    return "[]";
  }
  const itemLines = v.map((item) => ` - ${scalar(item)}`);
  return "\n" + itemLines.join("\n");
}
57
+
58
/**
 * Serialize one scalar frontmatter value.
 *
 * Non-strings are emitted via `String(v)`. Strings are emitted bare when that
 * is safe, and JSON-quoted when the bare form would not read back as the same
 * string:
 * - it contains `:`, `#`, a newline or carriage return, has leading/trailing
 *   whitespace, or is empty (would break the line-based parser);
 * - it starts with `"` (the parser would try to JSON-decode it);
 * - it reads as another type: `true`, `false`, `[]`, or anything `Number()`
 *   accepts (e.g. `42`, `1e3`, `0x10`, `Infinity`).
 *
 * @param v Value to serialize.
 * @returns The text to place after `key: ` or `- `.
 */
function scalar(v: unknown): string {
  if (typeof v !== "string") return String(v);

  const breaksSyntax = /[:#\n\r]|^\s|\s$/.test(v) || v === "";
  const looksQuoted = v.startsWith('"');
  // Mirrors what the parser would coerce a bare token into.
  const readsAsOtherType =
    v === "true" || v === "false" || v === "[]" || !Number.isNaN(Number(v));

  return breaksSyntax || looksQuoted || readsAsOtherType ? JSON.stringify(v) : v;
}
66
+
67
/**
 * Serialize frontmatter to a `---`-delimited block with one `key:value` entry
 * per property, in the object's own enumeration order. Array values span
 * multiple lines. No trailing newline is added.
 *
 * @param fm Frontmatter to serialize.
 * @returns The frontmatter block, including both `---` fences.
 */
export function serializeFrontmatter(fm: Frontmatter): string {
  const entries = Object.entries(fm).map(
    ([key, value]) => `${key}:${serializeValue(value)}`,
  );
  return ["---", ...entries, "---"].join("\n");
}
79
+
80
/**
 * Serialize a whole document: frontmatter block, a blank line, then the body
 * with trailing whitespace trimmed and exactly one final newline.
 */
export function serializeDoc(doc: MarkdownDoc): string {
  const header = serializeFrontmatter(doc.frontmatter);
  const body = doc.body.trimEnd();
  return header + "\n\n" + body + "\n";
}
83
+
84
/**
 * Parse a markdown document with optional `---`-fenced frontmatter.
 *
 * Supported frontmatter subset: `key: scalar`, `key: []`, and `key:` followed
 * by `- item` lines (a string array). Lines that are not `key: …` are skipped.
 * `summary` defaults to "" when absent. Leading blank lines of the body are
 * dropped.
 *
 * CRLF line endings are normalized to LF before matching, so files saved on
 * Windows keep their frontmatter instead of having it fall into the body.
 * When no frontmatter block is found, the input is returned untouched as the
 * body.
 *
 * @param raw Full file contents.
 * @returns The parsed frontmatter and body.
 */
export function parseDoc(raw: string): MarkdownDoc {
  const text = raw.replace(/\r\n/g, "\n");
  const m = text.match(/^---\n([\s\S]*?)\n---\n?([\s\S]*)$/);
  if (!m) return { frontmatter: { summary: "" }, body: raw };

  const [, fmBlock, body] = m;
  const fm: Frontmatter = { summary: "" };
  const listItem = /^\s*-\s+/;
  const lines = (fmBlock ?? "").split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]!;
    const kv = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (!kv) continue;
    const key = kv[1]!;
    const rest = kv[2]!;
    if (rest === "" && lines[i + 1]?.match(listItem)) {
      // String array: consume every following `- item` line.
      const arr: string[] = [];
      while (lines[i + 1]?.match(listItem)) {
        arr.push(String(unscalar(lines[++i]!.replace(listItem, ""))));
      }
      fm[key] = arr;
    } else if (rest === "[]") {
      fm[key] = [];
    } else {
      fm[key] = unscalar(rest);
    }
  }
  return { frontmatter: fm, body: (body ?? "").replace(/^\n+/, "") };
}

/**
 * Decode one scalar token from frontmatter.
 *
 * A token starting with `"` is JSON-decoded (and returned verbatim if that
 * fails); `true`/`false` become booleans; any non-empty token `Number()`
 * accepts becomes a number; everything else stays a string.
 */
function unscalar(s: string): string | number | boolean {
  if (s.startsWith('"')) {
    try {
      return JSON.parse(s) as string;
    } catch {
      // Not valid JSON after all — keep the raw text.
      return s;
    }
  }
  if (s === "true") return true;
  if (s === "false") return false;
  if (s !== "" && !Number.isNaN(Number(s))) return Number(s);
  return s;
}
125
+
126
+ // ── filesystem operations ─────────────────────────────────────────────────
127
+
128
/**
 * Create the full `.stz/` tier tree under `root`. Idempotent: directories
 * that already exist are left alone.
 *
 * @param root Project root directory.
 * @returns The tiers (relative paths) that were created by this call, in
 *   TIERS order.
 */
export async function scaffold(root: string): Promise<string[]> {
  const base = join(root, STZ_DIR);
  const created: string[] = [];
  for (const tier of TIERS) {
    const dir = join(base, tier);
    if (existsSync(dir)) continue;
    await mkdir(dir, { recursive: true });
    created.push(tier);
  }
  return created;
}
141
+
142
/**
 * Write a markdown doc to `.stz/<relPath>` under `root`, creating parent
 * directories as needed. An existing file is overwritten.
 *
 * @param root Project root directory.
 * @param relPath Path relative to the `.stz/` directory.
 * @param doc Document to serialize and write.
 */
export async function writeDoc(
  root: string,
  relPath: string,
  doc: MarkdownDoc,
): Promise<void> {
  const target = join(root, STZ_DIR, relPath);
  const parent = dirname(target);
  await mkdir(parent, { recursive: true });
  const contents = serializeDoc(doc);
  await writeFile(target, contents, "utf8");
}
152
+
153
/**
 * Read and parse the markdown doc at `.stz/<relPath>` under `root`.
 *
 * @param root Project root directory.
 * @param relPath Path relative to the `.stz/` directory.
 * @returns The parsed document.
 * @throws If the file cannot be read.
 */
export async function readDoc(root: string, relPath: string): Promise<MarkdownDoc> {
  const source = join(root, STZ_DIR, relPath);
  return parseDoc(await readFile(source, "utf8"));
}
158
+
159
/** Resolve a path relative to the `.stz/` directory into a path under `root`. */
export function stzPath(root: string, relPath: string): string {
  const segments = [root, STZ_DIR, relPath];
  return join(...segments);
}