slice-tournament-zoo 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +357 -0
- package/bin/stz.mjs +15 -0
- package/package.json +35 -0
- package/src/README.md +19 -0
- package/src/bridge.ts +950 -0
- package/src/budget.ts +78 -0
- package/src/cli.ts +126 -0
- package/src/cost-tracker.ts +59 -0
- package/src/escalation.ts +89 -0
- package/src/eval-runner.ts +220 -0
- package/src/grpo.ts +54 -0
- package/src/hack-detector.ts +124 -0
- package/src/index.ts +17 -0
- package/src/merge.ts +245 -0
- package/src/mock/README.md +40 -0
- package/src/mock/interfaces.ts +114 -0
- package/src/mock/mock.ts +223 -0
- package/src/mock/orchestrator.ts +457 -0
- package/src/pressure.ts +81 -0
- package/src/project.ts +335 -0
- package/src/seal.ts +182 -0
- package/src/selection.ts +128 -0
- package/src/specdiff.ts +141 -0
- package/src/state.ts +95 -0
- package/src/taxonomy.ts +161 -0
- package/src/types.ts +305 -0
package/src/budget.ts
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Adaptive complexity-based budgeting (F15, N5).
|
|
3
|
+
*
|
|
4
|
+
* Elicitation emits a complexity score 1..5 per slice. The orchestrator
|
|
5
|
+
* allocates token + wall-clock budget from a project pool against complexity.
|
|
6
|
+
* Actuals feed back into a calibration table so future estimates improve.
|
|
7
|
+
*/
|
|
8
|
+
import type { Budget } from "./types.js";
|
|
9
|
+
|
|
10
|
+
/** Tunable inputs that drive per-slice budget allocation. */
export interface BudgetConfig {
  /** Size of the project-wide token pool that slices draw from. */
  poolTokens: number;
  /** Token allowance for a slice at complexity 1. */
  baseTokens: number;
  /** Factor applied once for every complexity step above 1. */
  perComplexity: number;
  /** Wall-clock allowance per slice, in milliseconds. Default 30 min (N4). */
  baseWallClockMs: number;
}
|
|
20
|
+
|
|
21
|
+
/** Defaults: 5M-token pool, 200k base tokens, x1.6 per complexity step, 30 minutes. */
export const DEFAULT_BUDGET_CONFIG: BudgetConfig = {
  poolTokens: 5 * 1_000_000,
  baseTokens: 200 * 1_000,
  perComplexity: 1.6,
  baseWallClockMs: 1_800_000,
};
|
|
27
|
+
|
|
28
|
+
/**
 * Normalise a complexity score to an integer in 1..5.
 *
 * NaN maps to 1; every other value is rounded to the nearest integer and then
 * pinned to the closed range [1, 5].
 */
export function clampComplexity(c: number): number {
  if (Number.isNaN(c)) return 1;
  const rounded = Math.round(c);
  if (rounded < 1) return 1;
  if (rounded > 5) return 5;
  return rounded;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Allocate a per-slice budget.
 *
 * The token cap grows geometrically with complexity
 * (`baseTokens * perComplexity ** (c - 1)`, rounded) and is never allowed to
 * exceed the remaining pool. Wall-clock grows linearly: +50% of the base per
 * complexity step above 1.
 *
 * @param complexity Raw complexity score; normalised by `clampComplexity`.
 * @param poolRemaining Tokens left in the project pool. Negative and NaN
 *   values are treated as an empty pool.
 * @param cfg Budget configuration; defaults to `DEFAULT_BUDGET_CONFIG`.
 * @returns A fresh budget with `tokensSpent` set to 0.
 */
export function allocateBudget(
  complexity: number,
  poolRemaining: number,
  cfg: BudgetConfig = DEFAULT_BUDGET_CONFIG,
): Budget {
  const c = clampComplexity(complexity);
  const raw = Math.round(cfg.baseTokens * cfg.perComplexity ** (c - 1));
  // Math.max/Math.min propagate NaN, and a NaN cap makes every later
  // `spent + add > cap` comparison false, so the hard cap would never trip.
  // An unknown pool is therefore treated as empty.
  const available = Number.isNaN(poolRemaining) ? 0 : Math.max(0, poolRemaining);
  const tokenCap = Math.min(raw, available);
  return {
    tokenCap,
    wallClockMs: cfg.baseWallClockMs * (1 + 0.5 * (c - 1)),
    tokensSpent: 0,
  };
}
|
|
51
|
+
|
|
52
|
+
/** One calibration sample: estimated versus actual token spend at a complexity. */
export interface CalibrationEntry {
  /** Complexity score the sample was taken at. */
  complexity: number;
  /** Tokens that were estimated up front. */
  estimated: number;
  /** Tokens that were actually spent. */
  actual: number;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Re-centre the budget on observed spend.
 *
 * Computes the mean of `actual / estimated` over the usable entries and scales
 * `baseTokens` by it (rounded). Only `baseTokens` changes; `perComplexity` and
 * every other field are carried over untouched. Simple mean-of-ratios
 * estimator (transparent, auditable).
 *
 * An entry is usable when `estimated` is finite and positive and `actual` is
 * finite and non-negative. Anything else is ignored, so one corrupt sample
 * cannot turn `baseTokens` into NaN.
 *
 * @param entries Calibration samples; may be empty.
 * @param cfg Configuration to adjust; defaults to `DEFAULT_BUDGET_CONFIG`.
 * @returns `cfg` itself when there is nothing usable to learn from, otherwise
 *   a copy of `cfg` with the rescaled `baseTokens`.
 */
export function calibrate(entries: CalibrationEntry[], cfg = DEFAULT_BUDGET_CONFIG): BudgetConfig {
  const ratios = entries
    .filter(
      (e) =>
        Number.isFinite(e.estimated) &&
        e.estimated > 0 &&
        Number.isFinite(e.actual) &&
        e.actual >= 0,
    )
    .map((e) => e.actual / e.estimated);
  if (ratios.length === 0) return cfg;
  const meanRatio = ratios.reduce((a, b) => a + b, 0) / ratios.length;
  // Nudge baseTokens by the observed ratio so next estimates track actuals.
  const baseTokens = Math.round(cfg.baseTokens * meanRatio);
  // Last line of defence against overflow or a non-finite cfg.baseTokens.
  if (!Number.isFinite(baseTokens)) return cfg;
  return { ...cfg, baseTokens };
}
|
|
74
|
+
|
|
75
|
+
/**
 * True if spending `add` more tokens would breach the slice cap (N5 hard cap).
 *
 * Fails closed: if the spent total, `add`, or the cap is NaN, the result is
 * `true`, so a corrupted budget can never be read as "still within budget".
 *
 * @param budget Budget whose `tokensSpent` and `tokenCap` are compared.
 * @param add Additional tokens about to be spent.
 */
export function wouldExceed(budget: Budget, add: number): boolean {
  // Written as a negated `<=` because any comparison involving NaN is false;
  // the plain `>` form would report "no breach" for NaN operands.
  return !(budget.tokensSpent + add <= budget.tokenCap);
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* STZ CLI (F17). `npx stz <command>`.
|
|
3
|
+
*
|
|
4
|
+
* stz init [dir] scaffold the .stz/ taxonomy + AGENTS.md
|
|
5
|
+
* stz run [dir] run the bundled demo slice through the mock pipeline
|
|
6
|
+
* stz help
|
|
7
|
+
*/
|
|
8
|
+
import { join } from "node:path";
|
|
9
|
+
import { writeFile } from "node:fs/promises";
|
|
10
|
+
import { existsSync } from "node:fs";
|
|
11
|
+
import { scaffold, writeDoc, STZ_DIR, TIERS } from "./taxonomy.js";
|
|
12
|
+
import { runSlice } from "./mock/orchestrator.js";
|
|
13
|
+
import { runBridge } from "./bridge.js";
|
|
14
|
+
import { MockModelLayer, defaultMockConfig } from "./mock/mock.js";
|
|
15
|
+
import type { SliceManifest } from "./types.js";
|
|
16
|
+
|
|
17
|
+
/** Contents of the AGENTS.md file written by `stz init`; ends with a newline. */
const AGENTS_MD = [
  "# AGENTS.md — STZ table of contents",
  "",
  "This repo is managed by **slice-tournament-zoo (STZ)**. Progressive disclosure:",
  "load the tier summary you need, fetch full bodies only on named-anchor reference.",
  "",
  "| Tier | Purpose |",
  "|------|---------|",
  "| `.stz/00-intent/` | elicitation transcript, questionnaire, done-predicates |",
  "| `.stz/10-research/` | external/internal research, validated claims, spikes |",
  "| `.stz/20-standards/` | conventions (versioned), architecture decisions |",
  "| `.stz/30-tests/` | test plan, rubric, **sealed held-out suite** (read-only) |",
  "| `.stz/40-slices/` | per-slice manifest, plan, prototypes, tournament, spec-diff |",
  "| `.stz/50-pressure/` | culled specimens' diffs + critiques (the pressure log) |",
  "| `.stz/90-audit/` | journal, call ledger, cost, state.json |",
  "",
  "Vocabulary (the zoo metaphor): *specimens* = agents, *environment* = eval",
  "suite + conventions, *propagation* = winner's pattern carried forward,",
  "*selection pressure* = the culling mechanism, *pressure log* = the artifact.",
  "",
].join("\n");
|
|
36
|
+
|
|
37
|
+
/** Manifest of the bundled demo slice that `stz run` feeds to the mock pipeline. */
const DEMO_MANIFEST: SliceManifest = {
  id: "slice-01",
  name: "demo-slice",
  contract: "export function run(input: Request): Result",
  donePredicates: [
    {
      id: "schema",
      expr: "returns_schema(Result)",
      kind: "schema",
    },
    {
      id: "latency",
      expr: "p95_latency_ms < 200",
      kind: "metric",
    },
  ],
  traceTier: "minimal",
  complexity: 2,
  dependsOn: [],
  judge: { votesPerPair: 8 },
  summary: "Demo slice exercising the full STZ pipeline against the mock model layer.",
};
|
|
51
|
+
|
|
52
|
+
/**
 * `stz init`: scaffold the taxonomy under `dir`, write AGENTS.md next to it,
 * add the bootstrap note under the intent tier, and print a one-line summary.
 */
async function cmdInit(dir: string): Promise<void> {
  const created = await scaffold(dir);

  const agentsPath = join(dir, "AGENTS.md");
  await writeFile(agentsPath, AGENTS_MD, "utf8");

  const bootstrapDoc = {
    frontmatter: {
      summary: "Bootstrap (slice-00): hand-written minimal kernel; STZ produces itself from slice-01 (R7/F18).",
    },
    body: "# Bootstrap\n\nSlice-00 is this kernel. STZ dogfoods from slice-01 onward.\n",
  };
  await writeDoc(dir, join("00-intent", "bootstrap.md"), bootstrapDoc);

  const summary = `Scaffolded ${STZ_DIR}/ (${TIERS.length} tiers, ${created.length} created) + AGENTS.md at ${dir}`;
  console.log(summary);
}
|
|
61
|
+
|
|
62
|
+
/**
 * `stz run`: scaffold `dir` if it has no STZ directory yet, push the demo
 * manifest through `runSlice` with the mock model layer (n = 4), and print the
 * winner, faithfulness flag, round count and artifact count.
 */
async function cmdRun(dir: string): Promise<void> {
  const alreadyScaffolded = existsSync(join(dir, STZ_DIR));
  if (!alreadyScaffolded) await scaffold(dir);

  const model = new MockModelLayer(defaultMockConfig());
  const result = await runSlice({
    root: dir,
    manifest: DEMO_MANIFEST,
    model,
    n: 4,
    log: console.log,
  });

  const winnerLabel = result.winner ? "specimen-" + result.winner : "none (halted)";
  console.log("\n── result ──");
  console.log(`winner: ${winnerLabel}`);
  console.log(`faithful (no planned-but-missing): ${result.faithful}`);
  console.log(`rounds: ${result.rounds}`);
  console.log(`artifacts: ${result.artifacts.length} under ${STZ_DIR}/`);
}
|
|
72
|
+
|
|
73
|
+
/** Banner printed above the help text; begins and ends with a newline. */
const LOGO = [
  "",
  "██████╗ ████████╗ ███████╗",
  "██╔════╝ ╚══██╔══╝ ╚══███╔╝",
  "╚█████╗ ██║ ███╔╝",
  "╚═══██╗ ██║ ███╔╝",
  "██████╔╝ ██║ ███████╗",
  "╚═════╝ ╚═╝ ╚══════╝",
  "",
].join("\n");
|
|
81
|
+
|
|
82
|
+
/** `stz help`: print the banner followed by the usage text. */
function cmdHelp(): void {
  const usage = `slice-tournament-zoo: adversarial slice tournaments with a replayable audit trail

Usage:
stz init [dir] scaffold the .stz/ taxonomy + AGENTS.md (default: cwd)
stz run [dir] run the bundled demo slice through the mock pipeline
stz bridge <cmd> deterministic orchestration bridge (used by the /stz:* commands)
stz help show this help

In Claude Code, install the plugin and drive the full pipeline with /stz:new,
/stz:research, /stz:slice, /stz:pipeline, and friends. See the README.
`;
  console.log(LOGO);
  console.log(usage);
}
|
|
96
|
+
|
|
97
|
+
/**
 * CLI entry point: dispatch on the first argument after the script name.
 *
 * The second argument is the target directory (defaults to the current
 * working directory). No command, or `help`, prints the help text. An unknown
 * command prints an error plus the help text and sets exit code 1.
 */
async function main(): Promise<void> {
  const args = process.argv.slice(2);
  const cmd = args[0];
  const dir = args[1] ?? process.cwd();

  if (cmd === "init") {
    await cmdInit(dir);
    return;
  }
  if (cmd === "run") {
    await cmdRun(dir);
    return;
  }
  if (cmd === "bridge") {
    // Deterministic orchestration bridge called by the /stz:run command
    // between Task-subagent spawns. Everything after "bridge" is its argv.
    await runBridge(process.argv.slice(3));
    return;
  }
  if (cmd === "help" || cmd === undefined) {
    cmdHelp();
    return;
  }

  console.error(`unknown command: ${cmd}\n`);
  cmdHelp();
  process.exitCode = 1;
}

// Run immediately; any rejection is logged and turned into a failing exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cost / token tracker (N5 cost governance, N6 replay).
|
|
3
|
+
*
|
|
4
|
+
* Middleware around every Anthropic/OpenAI SDK call. Persists each call as a
|
|
5
|
+
* JSONL record under `90-audit/calls/` and aggregates spend into `state.json`.
|
|
6
|
+
* A replay command can reconstruct any decision from these records.
|
|
7
|
+
*/
|
|
8
|
+
import type { CallRecord, Phase } from "./types.js";
|
|
9
|
+
|
|
10
|
+
/**
 * In-memory ledger of model calls with monotonically increasing sequence
 * numbers, token aggregation, and JSONL round-tripping.
 */
export class CostTracker {
  /** Sequence number that the next `record()` call will assign. */
  private seq = 0;
  /** Recorded calls, in insertion order. */
  private records: CallRecord[] = [];

  /** Record one call. Returns the stored record (with assigned seq). */
  record(input: Omit<CallRecord, "seq">): CallRecord {
    const rec: CallRecord = { ...input, seq: this.seq++ };
    this.records.push(rec);
    return rec;
  }

  /** Shallow copy of the ledger; the record objects themselves are shared. */
  all(): CallRecord[] {
    return [...this.records];
  }

  /** Total tokens (prompt + completion) across all recorded calls. */
  totalTokens(): number {
    return this.records.reduce(
      (a, r) => a + r.promptTokens + r.completionTokens,
      0,
    );
  }

  /** Total tokens (prompt + completion) for the calls tagged with `phase`. */
  tokensForPhase(phase: Phase): number {
    return this.records
      .filter((r) => r.phase === phase)
      .reduce((a, r) => a + r.promptTokens + r.completionTokens, 0);
  }

  /** Number of recorded calls. */
  count(): number {
    return this.records.length;
  }

  /** Serialize the ledger as JSONL for 90-audit/calls/*.jsonl. */
  toJSONL(): string {
    return this.records.map((r) => JSON.stringify(r)).join("\n");
  }

  /**
   * Rebuild a tracker from persisted JSONL (replay/recovery).
   *
   * Blank lines are skipped. The next sequence number continues after the
   * highest finite `seq` seen. A record without a usable `seq` is kept as-is
   * but does not affect the counter; without that guard `Math.max` would turn
   * the counter into NaN and every later `record()` would assign NaN.
   *
   * @throws SyntaxError if a non-blank line is not valid JSON.
   */
  static fromJSONL(jsonl: string): CostTracker {
    const t = new CostTracker();
    for (const line of jsonl.split("\n")) {
      if (line.trim().length === 0) continue;
      // NOTE(review): the parsed value is trusted to be a CallRecord; no schema check is done here.
      const rec = JSON.parse(line) as CallRecord;
      t.records.push(rec);
      if (Number.isFinite(rec.seq)) {
        t.seq = Math.max(t.seq, rec.seq + 1);
      }
    }
    return t;
  }
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bounded failure escalation (F14, R1 mitigation).
|
|
3
|
+
*
|
|
4
|
+
* no passers → 1 GRPO retry round (losers' pressure log as negative context)
|
|
5
|
+
* → 1 replanning loop (failure analysis fed back into planning)
|
|
6
|
+
* → halt with a structured failure report.
|
|
7
|
+
*
|
|
8
|
+
* Hard ceiling. This FSM is the single source of truth for "are we allowed to
|
|
9
|
+
* try again?" — the orchestrator must consult it and never loop on its own.
|
|
10
|
+
* The ceiling is exactly: at most 1 retry and at most 1 replan, ever.
|
|
11
|
+
*/
|
|
12
|
+
import type { EscalationStage } from "./types.js";
|
|
13
|
+
|
|
14
|
+
/** Snapshot of the escalation FSM: current stage plus the attempts used so far. */
export interface EscalationState {
  /** Stage the FSM is currently in. */
  stage: EscalationStage;
  /** Retry rounds consumed so far. */
  retryCount: number;
  /** Replanning loops consumed so far. */
  replanCount: number;
}
|
|
19
|
+
|
|
20
|
+
/** Ceiling on retry rounds. */
export const MAX_RETRIES = 1;

/** Ceiling on replanning loops. */
export const MAX_REPLANS = 1;
|
|
22
|
+
|
|
23
|
+
/** Fresh FSM state: stage "normal" with no retries or replans used. */
export function initialEscalation(): EscalationState {
  const fresh: EscalationState = {
    stage: "normal",
    retryCount: 0,
    replanCount: 0,
  };
  return fresh;
}
|
|
26
|
+
|
|
27
|
+
/** What the orchestrator is told to do next; `note` is a human-readable reason. */
export type EscalationAction =
  | {
      type: "retry";
      note: string;
    }
  | {
      type: "replan";
      note: string;
    }
  | {
      type: "halt";
      note: string;
    };
|
|
31
|
+
|
|
32
|
+
/**
 * Decide what happens after a tournament produced no gate-passers.
 *
 * Order of preference: retry while `retryCount` is below `MAX_RETRIES`, then
 * replan while `replanCount` is below `MAX_REPLANS`, otherwise halt. The input
 * state is not modified; a new state is returned alongside the action. Pure
 * function: same input → same output (N6).
 */
export function onNoPassers(s: EscalationState): {
  next: EscalationState;
  action: EscalationAction;
} {
  const { retryCount, replanCount } = s;

  if (retryCount < MAX_RETRIES) {
    const action: EscalationAction = {
      type: "retry",
      note: "Re-running tournament with pressure log + K=4 surviving summaries as PDR refinement context.",
    };
    const next: EscalationState = { stage: "grpo-retry", retryCount: retryCount + 1, replanCount };
    return { next, action };
  }

  if (replanCount < MAX_REPLANS) {
    const action: EscalationAction = {
      type: "replan",
      note: "Re-entering planning phase with failure analysis from the prior round.",
    };
    const next: EscalationState = { stage: "replan", retryCount, replanCount: replanCount + 1 };
    return { next, action };
  }

  const action: EscalationAction = {
    type: "halt",
    note: "Retry and replan budgets exhausted. Emitting structured failure report.",
  };
  const next: EscalationState = { stage: "halted", retryCount, replanCount };
  return { next, action };
}
|
|
67
|
+
|
|
68
|
+
/** Whether the FSM has reached its terminal "halted" stage. */
export function isHalted(s: EscalationState): boolean {
  const { stage } = s;
  return stage === "halted";
}
|
|
72
|
+
|
|
73
|
+
/**
 * Step the FSM from `start` until it halts and return every action taken, the
 * final "halt" included. Used to prove the ceiling holds (test) and to dry-run
 * the path.
 *
 * @throws Error if no "halt" action appears within the step limit.
 */
export function escalationTrace(start = initialEscalation()): EscalationAction[] {
  // Independent belt-and-suspenders bound: the FSM itself must terminate in at
  // most MAX_RETRIES + MAX_REPLANS + 1 steps; the limit here leaves slack.
  const stepLimit = MAX_RETRIES + MAX_REPLANS + 5;
  const trace: EscalationAction[] = [];
  let state = start;
  let step = 0;
  while (step < stepLimit) {
    const outcome = onNoPassers(state);
    trace.push(outcome.action);
    if (outcome.action.type === "halt") return trace;
    state = outcome.next;
    step += 1;
  }
  throw new Error("escalation FSM failed to terminate — ceiling violated");
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real eval runner (F7/F11) — un-stubs the metrics the bridge used to take on
|
|
3
|
+
* trust. Everything here is genuinely executed, with no test/coverage/mutation
|
|
4
|
+
* library dependency (N10 minimal toolchain):
|
|
5
|
+
*
|
|
6
|
+
* testPassRate — run the sealed harness against the specimen, parse its
|
|
7
|
+
* final JSON line {passed,total,passRate}.
|
|
8
|
+
* coverage — run under NODE_V8_COVERAGE and measure the fraction of the
|
|
9
|
+
* specimen file's bytes that V8 marked executed.
|
|
10
|
+
* mutationScore — apply a small set of source mutators to the specimen,
|
|
11
|
+
* re-run the sealed harness against each mutant, and report
|
|
12
|
+
* the SURVIVAL rate (mutants the suite failed to kill). Lower
|
|
13
|
+
* is better; this is the eval signal that separates a thorough
|
|
14
|
+
* suite from a shallow one.
|
|
15
|
+
*
|
|
16
|
+
* The sealed harness contract: `node <sealed.mjs> <absolute-impl-path>` prints a
|
|
17
|
+
* final line `{"passed":n,"total":m,"passRate":r}` and exits 0 iff r===1. The
|
|
18
|
+
* runner resolves impl paths to absolute itself (the relative-path bug that bit
|
|
19
|
+
* the first live run must never reach a user).
|
|
20
|
+
*/
|
|
21
|
+
import { spawnSync } from "node:child_process";
|
|
22
|
+
import { readFileSync, writeFileSync, mkdtempSync, rmSync, readdirSync, mkdirSync } from "node:fs";
|
|
23
|
+
import { tmpdir } from "node:os";
|
|
24
|
+
import { resolve, join } from "node:path";
|
|
25
|
+
|
|
26
|
+
/** Summary the sealed harness prints as its final JSON line. */
export interface SealedResult {
  /** Number of passing checks. */
  passed: number;
  /** Number of checks run. */
  total: number;
  /** Pass rate; 1 means every check passed. */
  passRate: number;
}
|
|
31
|
+
|
|
32
|
+
/** Timeout for one sealed-harness run, in milliseconds (20 seconds). */
const RUN_TIMEOUT_MS = 20 * 1000;
|
|
33
|
+
|
|
34
|
+
/**
 * Run the sealed harness against one implementation file.
 *
 * Spawns `node <sealedPath> <implPath>` (both resolved to absolute paths) with
 * a timeout of `RUN_TIMEOUT_MS` and parses the last non-empty stdout line as
 * JSON. When `covDir` is given it is passed to the child as
 * `NODE_V8_COVERAGE`.
 *
 * @returns The parsed result when its `passRate` is a number; otherwise a
 *   zeroed result `{ passed: 0, total: 1, passRate: 0 }`.
 */
export function runSealed(sealedPath: string, implPath: string, covDir?: string): SealedResult {
  const implAbs = resolve(implPath);
  const harnessAbs = resolve(sealedPath);
  const childEnv = covDir ? { ...process.env, NODE_V8_COVERAGE: covDir } : process.env;

  const proc = spawnSync("node", [harnessAbs, implAbs], {
    encoding: "utf8",
    timeout: RUN_TIMEOUT_MS,
    env: childEnv,
  });

  const printed = (proc.stdout ?? "").trim().split("\n").filter(Boolean);
  const finalLine = printed[printed.length - 1] ?? "";
  try {
    const candidate = JSON.parse(finalLine) as SealedResult;
    if (typeof candidate.passRate === "number") return candidate;
  } catch {
    /* fall through to a zeroed result */
  }
  return { passed: 0, total: 1, passRate: 0 };
}
|
|
53
|
+
|
|
54
|
+
/** Three-way outcome of running two references against the same sealed suite. */
export type CrossStatus = "both-pass" | "divergent" | "both-fail";

/** Per-reference results plus the derived agreement flags. */
export interface CrossReferenceResult {
  /** Sealed-suite result for the primary (test-author's) reference. */
  a: SealedResult;
  /** Sealed-suite result for the independently-authored cross-family reference. */
  b: SealedResult;
  /** Both references satisfy the suite (the wanted green state). */
  bothPass: boolean;
  /** Exactly one reference passes: the suite encodes a reference-specific assumption. */
  divergent: boolean;
  /** Neither reference passes: the suite is unsatisfiable for both (a gate failure). */
  bothFail: boolean;
  /** The same outcome as the three flags, as a single tag. */
  status: CrossStatus;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Cross-family reference check (F10-adjacent / R2 "cross-family quorum").
 *
 * Runs the SAME sealed suite against two references, A and then B, and reports
 * whether they agree. A reference "passes" when its `passRate` is exactly 1.
 * The motivation: the smoke-gate reference is authored by the same agent as
 * the suite and shares its blind spots, so a second, independently authored
 * reference surfaces assumptions only one author made.
 *
 * This primitive only REPORTS the divergence; it deliberately does not
 * verdict. A B-fails-A-passes split is ambiguous (the suite over-fits A, or
 * reference B is simply wrong) and aggregate pass counts cannot tell the two
 * apart. Classification is the orchestrator's job, consistent with the
 * guide/sensor split in `sealed-suite.md`: divergence is a GUIDE-class signal
 * for human adjudication, not a sensor-style auto-rewrite trigger.
 */
export function crossReference(sealedPath: string, refAPath: string, refBPath: string): CrossReferenceResult {
  const a = runSealed(sealedPath, refAPath);
  const b = runSealed(sealedPath, refBPath);

  const passing = [a, b].filter((r) => r.passRate === 1).length;
  const bothPass = passing === 2;
  const divergent = passing === 1;
  const bothFail = passing === 0;

  let status: CrossStatus;
  if (bothPass) {
    status = "both-pass";
  } else if (divergent) {
    status = "divergent";
  } else {
    status = "both-fail";
  }
  return { a, b, bothPass, divergent, bothFail, status };
}
|
|
98
|
+
|
|
99
|
+
/** One V8 coverage range: half-open `[startOffset, endOffset)` plus its execution count. */
interface V8CoverageRange {
  startOffset: number;
  endOffset: number;
  count: number;
}

/**
 * Count how many of the first `fileLen` source offsets are executed according
 * to V8 block-coverage data.
 *
 * V8 ranges nest: the script-level range spans the whole file, and inner
 * ranges (functions, blocks) refine it with their own counts. The innermost
 * range covering an offset therefore decides whether it ran. Ranges are
 * painted outermost-first so that inner ranges overwrite outer ones, which
 * lets a `count === 0` block subtract from a covered parent.
 */
function countExecutedChars(functions: { ranges: V8CoverageRange[] }[], fileLen: number): number {
  const ranges: V8CoverageRange[] = [];
  for (const fn of functions) ranges.push(...fn.ranges);
  // Outer before inner: earlier start first, and for equal starts the longer range first.
  ranges.sort((x, y) => x.startOffset - y.startOffset || y.endOffset - x.endOffset);

  const marks = new Uint8Array(fileLen);
  for (const rng of ranges) {
    const mark = rng.count > 0 ? 1 : 0;
    const end = Math.min(rng.endOffset, fileLen);
    for (let i = Math.max(0, rng.startOffset); i < end; i++) marks[i] = mark;
  }

  let covered = 0;
  for (const m of marks) covered += m;
  return covered;
}

/**
 * Real coverage: fraction (0..1) of the impl file's characters that V8
 * recorded as executed while the sealed harness ran.
 *
 * Runs the harness with `NODE_V8_COVERAGE` pointing at a temporary directory,
 * finds the first coverage report with an entry for the impl file, and
 * measures the executed share of the file. The temporary directory is always
 * removed.
 *
 * @returns 0 for an empty file or when no report mentions the impl file.
 */
export function measureCoverage(sealedPath: string, implPath: string): number {
  const abs = resolve(implPath);
  const covDir = mkdtempSync(join(tmpdir(), "stz-cov-"));
  try {
    runSealed(sealedPath, abs, covDir);
    // NOTE(review): length is in UTF-16 code units; assumed to be the unit V8 uses for offsets — confirm.
    const fileLen = readFileSync(abs, "utf8").length;
    if (fileLen === 0) return 0;
    let covered = 0;
    for (const f of readdirSync(covDir)) {
      const data = JSON.parse(readFileSync(join(covDir, f), "utf8")) as {
        result?: { url: string; functions: { ranges: V8CoverageRange[] }[] }[];
      };
      const entry = (data.result ?? []).find(
        // An empty url must never match: `abs.includes("")` is true for every path.
        (e) => e.url.length > 0 && (e.url.includes(abs) || abs.includes(e.url.replace("file://", ""))),
      );
      if (!entry) continue;
      covered = countExecutedChars(entry.functions, fileLen);
      break;
    }
    return Math.min(1, covered / fileLen);
  } finally {
    rmSync(covDir, { recursive: true, force: true });
  }
}
|
|
131
|
+
|
|
132
|
+
/**
 * Source-level mutators. Each entry rewrites the FIRST match of its pattern
 * and returns the mutated copy, or null when the source is left unchanged.
 */
const MUTATORS: { name: string; apply: (s: string) => string | null }[] = (
  [
    ["lt→lte", /([^<>=])<([^<=])/, "$1<=$2"],
    ["gt→gte", /([^<>=])>([^>=])/, "$1>=$2"],
    ["lte→lt", /<=/, "<"],
    ["gte→gt", />=/, ">"],
    ["plus→minus", /([\w)])\s\+\s([\w(])/, "$1 - $2"],
    ["min→max", /Math\.min/, "Math.max"],
    ["max→min", /Math\.max/, "Math.min"],
    ["swap-cmp-lo-hi", /\blo\b/, "hi"],
  ] as [string, RegExp, string][]
).map(([name, pattern, replacement]) => ({
  name,
  apply: (s: string) => mutateFirst(s, pattern, replacement),
}));
|
|
143
|
+
|
|
144
|
+
/**
 * Apply `src.replace(re, repl)` and report whether it changed anything.
 *
 * @returns The rewritten source, or null when the result equals `src`.
 */
function mutateFirst(src: string, re: RegExp, repl: string): string | null {
  const rewritten = src.replace(re, repl);
  if (rewritten === src) return null;
  return rewritten;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Remove block and line comments so mutators only touch executable code.
 *
 * Regex-based: first every block comment is deleted, then on each line
 * everything from `//` to the end of the line is deleted unless the `//` is
 * directly preceded by a colon.
 */
function stripComments(src: string): string {
  const withoutBlockComments = src.replace(/\/\*[\s\S]*?\*\//g, "");
  const withoutLineComments = withoutBlockComments.replace(/(^|[^:])\/\/.*$/gm, "$1");
  return withoutLineComments;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Real mutation testing: produce mutants, re-run the sealed suite on each, and
 * report the SURVIVAL rate (1 = nothing killed, 0 = every mutant killed).
 *
 * The comment-stripped source is first copied into a temporary directory and
 * run unmutated as a baseline. Mutants are evaluated only if that baseline is
 * fully green. Otherwise every mutant would count as "killed" merely because
 * the relocated file already fails (or cannot load), and a broken specimen
 * would receive the best possible score. Mutators that do not change the
 * source are skipped.
 *
 * @returns `mutationScore` = survivors / mutants, or 1 (worst) when no mutant
 *   could be evaluated, so a degenerate file is never rewarded. Also returns
 *   the number of mutants run and how many survived.
 */
export function measureMutation(sealedPath: string, implPath: string): { mutationScore: number; mutants: number; survivors: number } {
  const abs = resolve(implPath);
  // Mutate executable code, not comments. A `Math.min` inside a doc comment
  // would otherwise yield a behaviour-identical mutant that always "survives"
  // and silently inflates the survival rate.
  const src = stripComments(readFileSync(abs, "utf8"));
  const dir = mkdtempSync(join(tmpdir(), "stz-mut-"));
  let mutants = 0;
  let survivors = 0;
  try {
    // Baseline: same location and same comment stripping as the mutants, so a
    // failure here means a mutant failure would carry no information.
    const baselinePath = join(dir, "baseline.mjs");
    writeFileSync(baselinePath, src, "utf8");
    const baselineGreen = runSealed(sealedPath, baselinePath).passRate === 1;
    if (baselineGreen) {
      for (const m of MUTATORS) {
        const mutated = m.apply(src);
        if (mutated === null) continue;
        mutants++;
        const mutPath = join(dir, `mutant-${m.name}.mjs`);
        writeFileSync(mutPath, mutated, "utf8");
        const res = runSealed(sealedPath, mutPath);
        // A surviving mutant still passes the suite (suite failed to catch it).
        if (res.passRate === 1) survivors++;
      }
    }
  } finally {
    rmSync(dir, { recursive: true, force: true });
  }
  const mutationScore = mutants === 0 ? 1 : survivors / mutants;
  return { mutationScore, mutants, survivors };
}
|
|
189
|
+
|
|
190
|
+
/** Combined result of the sealed run, the coverage run and the mutation run. */
export interface FullEval {
  /** `passRate` reported by the sealed harness. */
  testPassRate: number;
  /** Executed fraction of the impl file. */
  coverage: number;
  /** Mutant survival rate (lower is better). */
  mutationScore: number;
  /** Passing checks reported by the sealed harness. */
  passed: number;
  /** Total checks reported by the sealed harness. */
  total: number;
  /** Number of mutants that were run. */
  mutants: number;
  /** Number of mutants that still passed the suite. */
  survivors: number;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Run the sealed suite, the coverage measurement and the mutation measurement
 * (in that order) for one specimen implementation and merge the results.
 */
export function fullEval(sealedPath: string, implPath: string): FullEval {
  const { passRate, passed, total } = runSealed(sealedPath, implPath);
  const coverage = measureCoverage(sealedPath, implPath);
  const { mutationScore, mutants, survivors } = measureMutation(sealedPath, implPath);
  return {
    testPassRate: passRate,
    coverage,
    mutationScore,
    passed,
    total,
    mutants,
    survivors,
  };
}
|
|
215
|
+
|
|
216
|
+
/**
 * Write a metrics.json the bridge `record-eval` consumes.
 *
 * Creates the parent directory if needed, then writes only `testPassRate`,
 * `coverage` and `mutationScore` as 2-space-indented JSON.
 */
export function writeMetrics(path: string, e: FullEval): void {
  const parentDir = join(path, "..");
  mkdirSync(parentDir, { recursive: true });

  const { testPassRate, coverage, mutationScore } = e;
  const payload = JSON.stringify({ testPassRate, coverage, mutationScore }, null, 2);
  writeFileSync(path, payload, "utf8");
}
|
package/src/grpo.ts
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GRPO group-relative advantage (F8).
|
|
3
|
+
*
|
|
4
|
+
* advantage_i = (reward_i − group_mean) / (group_std + ε)
|
|
5
|
+
*
|
|
6
|
+
* Adopted at the harness selection layer (DeepSeekMath / verl formulation).
|
|
7
|
+
* Used both to pick the winner and to weight which losers' diffs are the most
|
|
8
|
+
* informative forward signal for the pressure log (F9).
|
|
9
|
+
*/
|
|
10
|
+
import type { Advantage, SpecimenId } from "./types.js";
|
|
11
|
+
|
|
12
|
+
/** Default ε added to the group standard deviation in the advantage denominator. */
export const GRPO_EPSILON = 1e-8;
|
|
13
|
+
|
|
14
|
+
/** Arithmetic mean of `xs`; 0 for an empty array. */
export function mean(xs: number[]): number {
  const n = xs.length;
  if (n === 0) return 0;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / n;
}
|
|
18
|
+
|
|
19
|
+
/**
 * Population standard deviation (divide by N, matching the GRPO group stat).
 *
 * @param xs Samples; an empty array yields 0.
 * @param mu Centre to measure deviations from; defaults to `mean(xs)`.
 */
export function stddev(xs: number[], mu = mean(xs)): number {
  const n = xs.length;
  if (n === 0) return 0;
  let sumOfSquares = 0;
  xs.forEach((x) => {
    sumOfSquares += (x - mu) ** 2;
  });
  return Math.sqrt(sumOfSquares / n);
}
|
|
25
|
+
|
|
26
|
+
/**
 * Compute group-relative advantages across a slice's specimen group:
 * `(reward - group mean) / (group std + epsilon)`.
 *
 * Thanks to `epsilon`, a group whose rewards are all equal never divides by
 * zero and yields (near-)zero advantages, i.e. "no signal".
 *
 * @param rewards One reward per specimen.
 * @param epsilon Added to the standard deviation; defaults to `GRPO_EPSILON`.
 * @returns One entry per input, in input order, carrying the original
 *   specimen and reward plus the computed advantage.
 */
export function groupRelativeAdvantage(
  rewards: { specimen: SpecimenId; reward: number }[],
  epsilon = GRPO_EPSILON,
): Advantage[] {
  const rewardValues = rewards.map((entry) => entry.reward);
  const groupMean = mean(rewardValues);
  const groupStd = stddev(rewardValues, groupMean);
  const denominator = groupStd + epsilon;
  return rewards.map((entry) => {
    const { specimen, reward } = entry;
    return { specimen, reward, advantage: (reward - groupMean) / denominator };
  });
}
|
|
44
|
+
|
|
45
|
+
/**
 * Rank specimens whose diffs are most informative as negative exemplars (F9):
 * the largest-magnitude advantages carry the most signal.
 *
 * @returns Specimen ids ordered by |advantage|, largest first. The input array
 *   is left untouched.
 */
export function mostInformative(advantages: Advantage[]): SpecimenId[] {
  const ranked = Array.from(advantages);
  ranked.sort((x, y) => Math.abs(y.advantage) - Math.abs(x.advantage));
  return ranked.map((entry) => entry.specimen);
}
|