@hegemonart/get-design-done 1.27.1 → 1.27.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +95 -0
- package/SKILL.md +1 -0
- package/agents/design-reflector.md +52 -0
- package/agents/perf-analyzer.md +166 -0
- package/hooks/budget-enforcer.ts +249 -5
- package/hooks/gdd-precompact-snapshot.js +334 -0
- package/hooks/gdd-sessionstart-recap.js +281 -0
- package/hooks/hooks.json +18 -0
- package/package.json +2 -2
- package/reference/bandit-integration.md +163 -0
- package/reference/perf-budget.md +142 -0
- package/reference/registry.json +14 -0
- package/reference/retrieval-contract.md +16 -0
- package/scripts/lib/bandit-arbitrage.cjs +423 -0
- package/scripts/lib/bandit-router/integration.cjs +309 -0
- package/scripts/lib/cache/gdd-cache-manager.cjs +292 -0
- package/scripts/lib/discuss-parallel-runner/index.ts +5 -1
- package/scripts/lib/explore-parallel-runner/index.ts +5 -1
- package/scripts/lib/parallelism-engine/concurrency-tuner.cjs +259 -0
- package/scripts/lib/parallelism-engine/concurrency-tuner.d.cts +53 -0
- package/scripts/lib/perf-analyzer/cost-regression.cjs +299 -0
- package/scripts/lib/perf-analyzer/index.cjs +139 -0
- package/scripts/lib/prompt-dedup/index.cjs +161 -0
- package/scripts/lib/session-runner/index.ts +206 -0
- package/skills/bandit-status/SKILL.md +129 -0
- package/skills/peers/SKILL.md +27 -8
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* hooks/gdd-sessionstart-recap.js — Plan 27.6-05
|
|
4
|
+
*
|
|
5
|
+
* Claude Code SessionStart hook. Emits a "what changed while you were
|
|
6
|
+
* away" diff between the most-recent PreCompact snapshot and the
|
|
7
|
+
* current STATE.md.
|
|
8
|
+
*
|
|
9
|
+
* Phase 27.6 D-09: markdown summary to stderr + structured JSON to
|
|
10
|
+
* `.design/snapshots/last-recap.json` (the JSON is a sidecar for
|
|
11
|
+
* downstream tools: progress dashboard, resume skill).
|
|
12
|
+
* Phase 27.6 D-10: harness-aware Codex no-op (Phase 45 dep for full
|
|
13
|
+
* pre-large-context recap integration).
|
|
14
|
+
*
|
|
15
|
+
* Silent-on-failure: tolerable errors exit 0 with breadcrumb.
|
|
16
|
+
* Emits `recap.emitted` event via lazy appendEvent (best-effort).
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
'use strict';
|
|
20
|
+
|
|
21
|
+
const fs = require('node:fs');
|
|
22
|
+
const path = require('node:path');
|
|
23
|
+
|
|
24
|
+
// Workspace paths, all anchored at the directory the hook is launched from
// (process.cwd()) and resolved once at module load.
const SNAPSHOT_DIR = path.resolve(process.cwd(), '.design', 'snapshots');
const STATE_MD_PATH = path.resolve(process.cwd(), '.design', 'STATE.md');
const EVENTS_PATH = path.resolve(process.cwd(), '.design', 'telemetry', 'events.jsonl');
// Sidecar written by main(); lives next to the snapshots it summarizes.
const RECAP_JSON_PATH = path.join(SNAPSHOT_DIR, 'last-recap.json');
// Stamped into the sidecar as `schema_version`.
const SCHEMA_VERSION = '1.0.0';
|
|
29
|
+
|
|
30
|
+
// ---------------------------------------------------------------------------
|
|
31
|
+
// Harness detection (D-10) — mirrors gdd-precompact-snapshot.js
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
/**
 * Identify which agent harness launched this hook (Phase 27.6 D-10).
 *
 * Reads `CLAUDE_HARNESS` first and falls back to `GDD_HARNESS`; the value is
 * lower-cased and trimmed before matching.
 *
 * @returns {'codex' | 'claude-code'} `'codex'` when the override is `codex`
 *   or `codex-cli`; `'claude-code'` for anything else, including unset.
 */
function detectHarness() {
  const explicit = (process.env.CLAUDE_HARNESS || process.env.GDD_HARNESS || '')
    .toLowerCase()
    .trim();
  if (explicit === 'codex' || explicit === 'codex-cli') return 'codex';
  return 'claude-code';
}
|
|
41
|
+
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
// Lazy event-stream emit (best-effort)
|
|
44
|
+
// ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
/**
 * Resolve the event-stream `appendEvent` function lazily.
 *
 * The event-stream module is required on demand so that a missing or broken
 * module never prevents the hook from running.
 *
 * @returns {Function} The module's `appendEvent` when the module loads and
 *   exposes a function under that name; otherwise a no-op with the same
 *   single-argument call shape.
 */
function getAppendEvent() {
  try {
    const m = require('../scripts/lib/event-stream');
    if (m && typeof m.appendEvent === 'function') return m.appendEvent;
  } catch {
    /* swallow — event-stream is optional infrastructure */
  }
  return function noopAppend(_ev) {
    /* no-op */
  };
}
|
|
57
|
+
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
// STATE.md tolerant parser (lighter than the PreCompact version — only
|
|
60
|
+
// needs frontmatter + a flat decisions list for the diff)
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
/**
 * Tolerantly read `.design/STATE.md` (at STATE_MD_PATH).
 *
 * Extracts two things only:
 *   - `frontmatter`: flat `key: value` pairs from a leading `---` fenced
 *     block (keys must match `\w+`; values are trimmed strings).
 *   - `decisions`: every `D-<digits>: ...` occurrence anywhere in the file,
 *     each captured to end of line and trimmed, in file order.
 *
 * Both LF and CRLF line endings are accepted, a leading UTF-8 BOM is
 * ignored, and the closing `---` may be the last line of the file.
 *
 * @returns {{frontmatter: Object<string, string>, decisions: string[]}}
 *   Empty containers when the file is missing or unreadable.
 */
function readStateMd() {
  let body;
  try {
    // A missing file surfaces here as ENOENT, so no separate existence check.
    body = fs.readFileSync(STATE_MD_PATH, 'utf8');
  } catch {
    return { frontmatter: {}, decisions: [] };
  }

  // A BOM would otherwise stop the `^---` anchor from matching.
  if (body.charCodeAt(0) === 0xfeff) body = body.slice(1);

  const frontmatter = {};
  const fmMatch = body.match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n|$)/);
  if (fmMatch) {
    // Split on \r?\n: `.` never matches \r, so a stray \r would make every
    // key/value line fail the `$`-anchored pattern below.
    for (const line of fmMatch[1].split(/\r?\n/)) {
      const m = line.match(/^(\w+):\s*(.+)$/);
      if (m) frontmatter[m[1]] = m[2].trim();
    }
  }

  // All D-XX entries anywhere in the body — broad sweep is fine for diff.
  // `[^\n]+` may capture a trailing \r on CRLF files; trim() removes it.
  const decisions = (body.match(/D-\d+:[^\n]+/g) || []).map((entry) => entry.trim());

  return { frontmatter, decisions };
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// Snapshot discovery — highest-mtime *.json (excluding last-recap.json)
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
 * Locate the most recently modified snapshot file in SNAPSHOT_DIR.
 *
 * A candidate is a regular file whose name ends in `.json`, other than the
 * recap sidecar `last-recap.json`. Names ending in `.tmp` never end in
 * `.json`, so in-flight temp files are excluded by the same check.
 *
 * @returns {string | null} Full path of the candidate with the highest
 *   mtime, or `null` when the directory is missing, unreadable, or holds no
 *   candidate.
 */
function findLatestSnapshot() {
  let files;
  try {
    files = fs.readdirSync(SNAPSHOT_DIR);
  } catch {
    // Missing or unreadable directory: nothing to recap against.
    return null;
  }

  let best = null;
  let bestMtime = -1;
  for (const name of files) {
    if (!name.endsWith('.json') || name === 'last-recap.json') continue;
    const full = path.join(SNAPSHOT_DIR, name);
    try {
      const stat = fs.statSync(full);
      // statSync succeeds on directories too; a directory named `*.json`
      // must not shadow a real snapshot just because its mtime is newer.
      if (!stat.isFile()) continue;
      if (stat.mtimeMs > bestMtime) {
        best = full;
        bestMtime = stat.mtimeMs;
      }
    } catch {
      /* swallow — entry vanished or is unreadable; try the next one */
    }
  }
  return best;
}
|
|
124
|
+
|
|
125
|
+
// ---------------------------------------------------------------------------
|
|
126
|
+
// Event count since snapshot timestamp (JSONL-tolerant)
|
|
127
|
+
// ---------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
/**
 * Count telemetry events recorded after a given moment.
 *
 * Reads EVENTS_PATH as JSONL. Blank lines, lines that are not valid JSON,
 * and events without a string `timestamp` are skipped.
 *
 * Timestamps are compared as instants (via Date.parse) whenever both the
 * cutoff and the event timestamp parse, so differing precision or UTC
 * offsets (`...00Z` vs `...00.000Z`, `+02:00`) compare correctly. If either
 * side does not parse, the comparison falls back to plain string ordering.
 *
 * @param {string} isoTimestamp Cutoff; only events strictly after it count.
 * @returns {number} Number of matching events; 0 when the file is missing
 *   or unreadable.
 */
function countEventsSince(isoTimestamp) {
  let body;
  try {
    body = fs.readFileSync(EVENTS_PATH, 'utf8');
  } catch {
    return 0;
  }

  const cutoffMs = typeof isoTimestamp === 'string' ? Date.parse(isoTimestamp) : NaN;

  let count = 0;
  for (const line of body.split(/\r?\n/)) {
    const t = line.trim();
    if (t.length === 0) continue;
    try {
      const ev = JSON.parse(t);
      if (typeof ev.timestamp !== 'string') continue;
      const eventMs = Date.parse(ev.timestamp);
      const isNewer =
        Number.isFinite(cutoffMs) && Number.isFinite(eventMs)
          ? eventMs > cutoffMs
          : ev.timestamp > isoTimestamp;
      if (isNewer) count++;
    } catch {
      /* tolerate malformed — includes JSON values such as `null` */
    }
  }
  return count;
}
|
|
152
|
+
|
|
153
|
+
// ---------------------------------------------------------------------------
|
|
154
|
+
// Main
|
|
155
|
+
// ---------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
/**
 * SessionStart hook entry point.
 *
 * Flow:
 *   1. On the codex harness, log a no-op breadcrumb and exit 0 (D-10).
 *   2. Find the newest snapshot; exit 0 with a breadcrumb when there is none
 *      or when it cannot be read as a JSON object.
 *   3. Diff the snapshot against the current STATE.md: decisions not present
 *      in `last_n_decisions`, events newer than the snapshot timestamp,
 *      cycle change, and elapsed time.
 *   4. Write a markdown summary to stderr and a JSON sidecar to
 *      RECAP_JSON_PATH (D-09). Sidecar failures are logged, not fatal.
 *   5. Emit a best-effort `recap.emitted` event.
 *   6. Write the `{continue, suppressOutput}` verdict to stdout and exit 0
 *      once that write has been handed off.
 *
 * Every path exits with status 0.
 */
function main() {
  const harness = detectHarness();
  if (harness === 'codex') {
    // D-10: SessionStart on Codex skips recap; Phase 45 dep for full
    // pre-large-context-action integration.
    process.stderr.write('[gdd-sessionstart-recap] codex harness no-op (Phase 45 dep)\n');
    process.exit(0);
  }

  const snapshotPath = findLatestSnapshot();
  if (!snapshotPath) {
    process.stderr.write('[gdd-sessionstart-recap] no prior snapshot\n');
    process.exit(0);
  }

  let snapshot;
  try {
    snapshot = JSON.parse(fs.readFileSync(snapshotPath, 'utf8'));
    // Valid JSON that is not an object (null, array, number, ...) cannot be
    // diffed; report it as unreadable instead of failing on property access.
    if (snapshot === null || typeof snapshot !== 'object' || Array.isArray(snapshot)) {
      throw new Error('expected a JSON object');
    }
  } catch (err) {
    process.stderr.write(
      '[gdd-sessionstart-recap] snapshot unreadable: ' +
        (err && err.message ? err.message : String(err)) +
        '\n',
    );
    process.exit(0);
  }

  const current = readStateMd();
  const priorDecisions = Array.isArray(snapshot.last_n_decisions)
    ? snapshot.last_n_decisions
    : [];
  const priorSet = new Set(priorDecisions);
  const newDecisions = current.decisions.filter((d) => !priorSet.has(d));
  const newEventCount = countEventsSince(snapshot.timestamp || '1970-01-01T00:00:00.000Z');

  const priorCycle = snapshot.cycle_id || 'unknown';
  const currentCycle = current.frontmatter.milestone || 'unknown';
  const cycleChanged = priorCycle !== currentCycle ? `${priorCycle} → ${currentCycle}` : null;

  // An absent or unparseable timestamp yields 0 elapsed rather than NaN.
  const snapshotTime = snapshot.timestamp ? new Date(snapshot.timestamp).getTime() : 0;
  const timeElapsedMs =
    snapshotTime > 0 && Number.isFinite(snapshotTime) ? Date.now() - snapshotTime : 0;

  // Markdown summary to stderr (D-09). At most five decisions are listed;
  // the full list goes into the sidecar.
  const md = [
    '## Session Recap',
    `Snapshot taken: ${snapshot.timestamp || 'unknown'}`,
    `Time elapsed: ${(timeElapsedMs / 60000).toFixed(1)} min`,
    cycleChanged ? `Cycle: ${cycleChanged}` : `Cycle: ${currentCycle} (unchanged)`,
    `New decisions: ${newDecisions.length}`,
    ...newDecisions.slice(0, 5).map((d) => ` - ${d}`),
    `New events since snapshot: ${newEventCount}`,
    '',
  ].join('\n');
  process.stderr.write(md + '\n');

  // JSON sidecar (D-09) — atomic .tmp + rename for consistency.
  const recap = {
    schema_version: SCHEMA_VERSION,
    previous_snapshot: snapshotPath,
    current_timestamp: new Date().toISOString(),
    diff: {
      new_decisions: newDecisions,
      new_events_since_snapshot: newEventCount,
      cycle_changed: cycleChanged,
      time_elapsed_ms: timeElapsedMs,
    },
  };

  const recapTmpPath = RECAP_JSON_PATH + '.tmp';
  try {
    // mkdir -p for safety — directory should exist if snapshotPath was found,
    // but defensive ensure for race conditions.
    fs.mkdirSync(SNAPSHOT_DIR, { recursive: true });
    fs.writeFileSync(recapTmpPath, JSON.stringify(recap, null, 2), 'utf8');
    fs.renameSync(recapTmpPath, RECAP_JSON_PATH);
  } catch (err) {
    // Do not leave a half-written temp file behind after a failed write/rename.
    try {
      fs.unlinkSync(recapTmpPath);
    } catch {
      /* swallow — temp file may never have been created */
    }
    process.stderr.write(
      '[gdd-sessionstart-recap] sidecar write failed: ' +
        (err && err.message ? err.message : String(err)) +
        '\n',
    );
  }

  // Best-effort event emit.
  const appendEvent = getAppendEvent();
  try {
    appendEvent({
      type: 'recap.emitted',
      timestamp: new Date().toISOString(),
      sessionId: process.env.GDD_SESSION_ID || 'sessionstart-hook',
      payload: {
        new_decisions: newDecisions.length,
        new_events: newEventCount,
        time_elapsed_ms: timeElapsedMs,
        harness,
      },
    });
  } catch {
    /* swallow */
  }

  // Emit non-blocking continue verdict on stdout. process.stdout writes can
  // be asynchronous, and process.exit() does not wait for pending writes, so
  // exit from the write callback instead of immediately after the call.
  const exitOk = () => process.exit(0);
  try {
    process.stdout.write(JSON.stringify({ continue: true, suppressOutput: true }), exitOk);
  } catch {
    exitOk();
  }
}
|
|
267
|
+
|
|
268
|
+
// Script entry: run the hook and make sure nothing ever escapes as a
// non-zero exit. Any exception is reduced to a one-line stderr breadcrumb;
// if even that write fails, the failure is ignored and the exit is still 0.
try {
  main();
} catch (err) {
  try {
    process.stderr.write(
      '[gdd-sessionstart-recap] uncaught: ' +
        (err && err.message ? err.message : String(err)) +
        '\n',
    );
  } catch {
    /* swallow */
  }
  process.exit(0);
}
|
package/hooks/hooks.json
CHANGED
|
@@ -24,6 +24,14 @@
|
|
|
24
24
|
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/first-run-nudge.sh\""
|
|
25
25
|
}
|
|
26
26
|
]
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"hooks": [
|
|
30
|
+
{
|
|
31
|
+
"type": "command",
|
|
32
|
+
"command": "node \"${CLAUDE_PLUGIN_ROOT}/hooks/gdd-sessionstart-recap.js\""
|
|
33
|
+
}
|
|
34
|
+
]
|
|
27
35
|
}
|
|
28
36
|
],
|
|
29
37
|
"PreToolUse": [
|
|
@@ -110,6 +118,16 @@
|
|
|
110
118
|
}
|
|
111
119
|
]
|
|
112
120
|
}
|
|
121
|
+
],
|
|
122
|
+
"PreCompact": [
|
|
123
|
+
{
|
|
124
|
+
"hooks": [
|
|
125
|
+
{
|
|
126
|
+
"type": "command",
|
|
127
|
+
"command": "node \"${CLAUDE_PLUGIN_ROOT}/hooks/gdd-precompact-snapshot.js\""
|
|
128
|
+
}
|
|
129
|
+
]
|
|
130
|
+
}
|
|
113
131
|
]
|
|
114
132
|
}
|
|
115
133
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hegemonart/get-design-done",
|
|
3
|
-
"version": "1.27.
|
|
3
|
+
"version": "1.27.6",
|
|
4
4
|
"description": "A design-quality pipeline for AI coding agents: brief, plan, implement, and verify UI work against your design system.",
|
|
5
5
|
"author": "Hegemon",
|
|
6
6
|
"homepage": "https://github.com/hegemonart/get-design-done",
|
|
@@ -83,7 +83,7 @@
|
|
|
83
83
|
],
|
|
84
84
|
"hooks": "hooks/hooks.json",
|
|
85
85
|
"dependencies": {
|
|
86
|
-
"@anthropic-ai/claude-agent-sdk": "^0.
|
|
86
|
+
"@anthropic-ai/claude-agent-sdk": "^0.3.143",
|
|
87
87
|
"@clack/prompts": "^1.2.0",
|
|
88
88
|
"@modelcontextprotocol/sdk": "^1.0.0"
|
|
89
89
|
},
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: bandit-integration
|
|
3
|
+
phase: 27.5
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
type: meta-rules
|
|
6
|
+
description: Bandit posterior + production-integration shim cheat sheet — signatures, reward function semantics, adaptive_mode gate, posterior path conventions.
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Bandit Integration — Developer Cheat Sheet
|
|
10
|
+
|
|
11
|
+
**Phase 27.5 (v1.27.5).** Reference for the bandit production-integration surface. Authoring or modifying a caller of the bandit posterior? Debugging a routing decision at the code level? Start here.
|
|
12
|
+
|
|
13
|
+
For ops-level guidance (when bandit fires, how to disable, posterior inspection), see `docs/BANDIT-INTEGRATION.md`.
|
|
14
|
+
|
|
15
|
+
In-scope modules:
|
|
16
|
+
|
|
17
|
+
- `scripts/lib/bandit-router.cjs` (Phase 23.5 primitives).
|
|
18
|
+
- `scripts/lib/bandit-router/integration.cjs` (Phase 27.5 shim).
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## The two-stage architecture
|
|
23
|
+
|
|
24
|
+
Phase 23.5 ships the bandit primitives — Thompson-sampling pull, posterior update, computeReward, atomic persistence. Phase 27-07 added the `delegate?` arm dimension (5 peer-CLI arms + the local `none` arm). Both phases shipped library-only with no production callers.
|
|
25
|
+
|
|
26
|
+
Phase 27.5 ships the production-integration shim that wraps the primitives behind two purpose-built entry points and hides the `pull` vs `pullWithDelegate` choice. Callers pass a `delegate` argument and the shim routes internally.
|
|
27
|
+
|
|
28
|
+
### Phase 23.5 + 27-07 surface — `scripts/lib/bandit-router.cjs`
|
|
29
|
+
|
|
30
|
+
Exports: `pull`, `update`, `pullWithDelegate`, `updateWithDelegate`, `computeReward`, `loadPosterior`, `savePosterior`, `reset`, `decayArm`, `sampleBeta`, `priorFor`, `binForGlobCount`, `DEFAULT_DELEGATES`, `DELEGATE_NONE`, `TIER_PRIOR`, `PRIOR_STRENGTH`, `TOUCHES_BINS`, `DEFAULT_POSTERIOR_PATH`, `SCHEMA_VERSION`.
|
|
31
|
+
|
|
32
|
+
The two-pair primitive split:
|
|
33
|
+
|
|
34
|
+
- `pull({agent, bin, ...})` / `update({agent, bin, tier, reward, ...})` — operate on the `(agent, bin, tier)` arm slice. Equivalent to `delegate='none'`.
|
|
35
|
+
- `pullWithDelegate({agent, bin, delegates, ...})` / `updateWithDelegate({agent, bin, tier, delegate, reward, ...})` — operate on the `(agent, bin, tier, delegate)` arm slice for any `delegate ∈ DEFAULT_DELEGATES`.
|
|
36
|
+
|
|
37
|
+
### Phase 27.5 surface — `scripts/lib/bandit-router/integration.cjs`
|
|
38
|
+
|
|
39
|
+
Exports: `consultBandit`, `recordOutcome`, `DELEGATE_NONE`.
|
|
40
|
+
|
|
41
|
+
Routing rules (D-05, D-07):
|
|
42
|
+
|
|
43
|
+
1. `agentFrontmatter.tier_override` set → bypass bandit, return `tier_override`.
|
|
44
|
+
2. `adaptiveMode !== 'full'` → bandit silent, return `frontmatter.default_tier`.
|
|
45
|
+
3. `adaptiveMode === 'full'` + delegate `'none'` or undefined → call `pull()`.
|
|
46
|
+
4. `adaptiveMode === 'full'` + delegate is a peer name → call `pullWithDelegate({delegates: [delegate]})`.
|
|
47
|
+
|
|
48
|
+
`recordOutcome` is symmetric on the adaptive-mode gate.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## `consultBandit` signature
|
|
53
|
+
|
|
54
|
+
```javascript
|
|
55
|
+
consultBandit({
|
|
56
|
+
agent: string, // required
|
|
57
|
+
bin: string, // required: 'tiny' | 'small' | 'medium' | 'large'
|
|
58
|
+
delegate: string, // 'none' or one of DEFAULT_DELEGATES
|
|
59
|
+
agentFrontmatter: {
|
|
60
|
+
tier_override?: string,
|
|
61
|
+
default_tier?: string,
|
|
62
|
+
},
|
|
63
|
+
adaptiveMode?: 'static' | 'hedge' | 'full', // omit to read on-disk
|
|
64
|
+
baseDir?: string, // override workspace root (test-injection)
|
|
65
|
+
posteriorPath?: string, // override posterior file path (test-injection)
|
|
66
|
+
}) → {
|
|
67
|
+
tier: 'haiku' | 'sonnet' | 'opus',
|
|
68
|
+
decision_log: {
|
|
69
|
+
source: 'frontmatter' | 'tier_override_bypass' | 'bandit_pull' | 'bandit_pull_with_delegate',
|
|
70
|
+
samples?: { haiku?: number, sonnet?: number, opus?: number },
|
|
71
|
+
delegate?: string,
|
|
72
|
+
adaptive_mode: string,
|
|
73
|
+
reason?: string,
|
|
74
|
+
},
|
|
75
|
+
}
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
`decision_log.source` is the audit trail — it tells observability tools which routing branch ran. Tests use it to assert the correct path was taken.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## `recordOutcome` signature
|
|
83
|
+
|
|
84
|
+
```javascript
|
|
85
|
+
recordOutcome({
|
|
86
|
+
agent: string,
|
|
87
|
+
bin: string,
|
|
88
|
+
delegate: string,
|
|
89
|
+
tier: string,
|
|
90
|
+
status: string, // SessionResult.status — only 'completed' triggers reward.solidify_pass
|
|
91
|
+
costUsd?: number,
|
|
92
|
+
adaptiveMode?: 'static' | 'hedge' | 'full',
|
|
93
|
+
baseDir?: string,
|
|
94
|
+
posteriorPath?: string,
|
|
95
|
+
}) → void // best-effort per D-04 — write errors are swallowed
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Reward semantics:
|
|
99
|
+
|
|
100
|
+
- `solidify_pass = (status === 'completed')`.
|
|
101
|
+
- If `!solidify_pass`, reward is `0`. If true, reward is `1 - lambda * normalize(costUsd + epsilon * wallTimeMs)`.
|
|
102
|
+
|
|
103
|
+
Phase 27.5 passes `wallTimeMs: 0` always (D-08 unchanged from Phase 23.5).
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## `adaptive_mode` gate semantics
|
|
108
|
+
|
|
109
|
+
Phase 23.5 ladder (D-07):
|
|
110
|
+
|
|
111
|
+
- `static` — default. Bandit silent. `default-tier:` is authoritative. No reads, no writes.
|
|
112
|
+
- `hedge` — reserved for measurement-only operation. Intended semantics (Phase 28+ explicit "hedge mode"): bandit silent on reads, while `recordOutcome` still writes to seed the posterior. In Phase 27.5 this is not yet implemented — `hedge` currently behaves identically to `static` (no reads, no writes).
|
|
113
|
+
- `full` — bandit active. Reads pick via Thompson sampling; writes update posterior.
|
|
114
|
+
|
|
115
|
+
The shim respects the gate transparently. Operators flip via `.design/budget.json#adaptive_mode`.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Reward function
|
|
120
|
+
|
|
121
|
+
`computeReward({solidify_pass, cost_usd, wall_time_ms, lambda?, epsilon?, costNormalizer?}) → number`
|
|
122
|
+
|
|
123
|
+
Two-stage lexicographic (D-08, unchanged from Phase 23.5):
|
|
124
|
+
|
|
125
|
+
- Stage 1 — correctness: if `solidify_pass !== true`, return `0`.
|
|
126
|
+
- Stage 2 — cost: return `1 - lambda * normalize(cost_usd + epsilon * wall_time_ms)`.
|
|
127
|
+
|
|
128
|
+
Defaults: `lambda = 0.3`, `epsilon = 0.05`. `normalize` maps `[0, $5]` linearly to `[0, 1]`, clamped.
|
|
129
|
+
|
|
130
|
+
Cheaper successful spawns get higher reward. Failed spawns are flat zero. Lower `lambda` to weight cost less; raise it to penalize cost more.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Posterior path
|
|
135
|
+
|
|
136
|
+
Canonical path: `.design/telemetry/posterior.json` (Phase 23.5 D-08, Phase 27.5 D-06 unchanged). Path is owned by `DEFAULT_POSTERIOR_PATH` constant in `scripts/lib/bandit-router.cjs`.
|
|
137
|
+
|
|
138
|
+
Test injection: pass `baseDir` (anchors path under a different workspace root) or `posteriorPath` (overrides the file path directly). Both `consultBandit` and `recordOutcome` accept these options.
|
|
139
|
+
|
|
140
|
+
Write discipline: atomic via `.tmp` + rename. Read failures yield an empty posterior; subsequent writes overwrite. Concurrent writers within the same process are not synchronized — gdd's session-runner is single-threaded.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Call sites
|
|
145
|
+
|
|
146
|
+
Phase 27.5 wires these consumers:
|
|
147
|
+
|
|
148
|
+
- **`hooks/budget-enforcer.ts`** (Plan 27.5-02) — per Agent spawn, after `resolved_models` is computed, before SDK call. Calls `consultBandit({agent, bin, delegate, agentFrontmatter, adaptiveMode})`. Overrides `resolved_models[agent]` with the bandit tier via `tier-resolver.cjs`. Emits `bandit.tier_selected` event for observability.
|
|
149
|
+
- **`scripts/lib/session-runner/index.ts`** (Plan 27.5-03) — terminal-emit path. Calls `recordOutcome({agent, bin, delegate, tier, status, costUsd})` after every `emit('session.completed', ...)` site (4 sites: rate-limited, peer-success, turn-cap-zero, terminal retry-exit). Posterior write is best-effort; missing optional fields silent.
|
|
150
|
+
- **`agents/design-reflector.md` Section 8** (Plan 27.5-04) — bandit-arbitrage analysis. `scripts/lib/bandit-arbitrage.cjs` reads `.design/telemetry/posterior.json` and surfaces stale-frontmatter proposals. Mirrors Phase 26-06's `cost-arbitrage.cjs` shape.
|
|
151
|
+
- **`skills/peers/SKILL.md` Step 5 + `skills/bandit-status/SKILL.md`** (Plan 27.5-05) — read-only diagnostic surfaces. `/gdd:peers` posterior delta column populated; `/gdd:bandit-status` renders per-`(agent, bin, delegate, tier)` snapshots.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Cross-references
|
|
156
|
+
|
|
157
|
+
- `docs/BANDIT-INTEGRATION.md` — operator guide (when bandit fires, how to disable, troubleshooting).
|
|
158
|
+
- `reference/peer-protocols.md` — Phase 27 ACP/ASP cheat sheet (peer-CLI delegation transport).
|
|
159
|
+
- `scripts/lib/bandit-router.cjs` — Phase 23.5 primitives surface.
|
|
160
|
+
- `scripts/lib/bandit-router/integration.cjs` — Phase 27.5 production shim.
|
|
161
|
+
- `scripts/lib/bandit-arbitrage.cjs` — Phase 27.5 reflector analyzer (Section 8 of `design-reflector.md`).
|
|
162
|
+
- `hooks/budget-enforcer.ts` — bandit consultation site.
|
|
163
|
+
- `scripts/lib/session-runner/index.ts` — `recordOutcome` site.
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: perf-budget
|
|
3
|
+
phase: 27.6
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
type: meta-rules
|
|
6
|
+
description: Per-agent token-cost budget reference and CI regression-gate documentation. Budgets sourced from current p50 + 25% buffer (Phase 27.6 D-05); CI gate fails on >25% regression vs baseline across 3 cycles (D-01); thresholds configurable via .design/budget.json.
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
# Per-Agent Performance Budgets — Phase 27.6
|
|
10
|
+
|
|
11
|
+
This reference documents the token-cost budgets that the pipeline measures itself against. Two surfaces consume this document:
|
|
12
|
+
|
|
13
|
+
1. `tests/perf-budget.test.cjs` — CI regression gate. Fails the build when any agent's p50 USD-cost has regressed by at least the threshold (default 25%; the boundary is inclusive) vs baseline across the last 3 cycles.
|
|
14
|
+
2. `agents/perf-analyzer.md` — cross-cycle reflector. Reads the same budget + telemetry; surfaces top-3 regressions as `[REGRESSION]` proposals.
|
|
15
|
+
|
|
16
|
+
Phase 27.5 (v1.27.5, shipped 2026-05-17) made production telemetry real by wiring the bandit into routing. Phase 27.6 reads what 27.5 writes.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## How budgets are derived
|
|
21
|
+
|
|
22
|
+
Per **D-05**, each per-agent budget is the agent's current p50 USD-cost plus a 25% buffer (`p50 × 1.25`). The buffer absorbs natural cycle-to-cycle variance without firing the gate, while still flagging genuine cost growth.
|
|
23
|
+
|
|
24
|
+
Per **D-03**, the v1.27.6 baseline data lives at `test-fixture/baselines/phase-27-6/perf-baseline.json` and is built from **synthetic cycle replay**. Real-cycle calibration ships as a follow-up after 1-2 production cycles accumulate, via the commit:
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
chore(27.6): recalibrate perf-budget against measured cycles
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Per **D-01**, the regression-gate threshold is **25%**, configurable via `.design/budget.json#perf_regression_threshold`. A minimum of **3 distinct cycles** must be observed per agent before that agent is evaluated for regression. Agents with fewer than 3 cycles are silently skipped (cold-start tolerance).
|
|
31
|
+
|
|
32
|
+
This conservative-then-tighten discipline matches Phase 23.5 `PRIOR_STRENGTH` calibration — start wide to avoid noise, tighten once enough samples accumulate to compute realistic p95 bounds.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Per-agent budget table
|
|
37
|
+
|
|
38
|
+
| Agent | Baseline p50 (USD) | Budget = p50 × 1.25 (USD) | Hit-rate baseline | p95 wall (ms) | Notes |
|
|
39
|
+
|---|---|---|---|---|---|
|
|
40
|
+
| design-verifier | 0.04 | 0.05 (+25%) | 0.55 | 12000 | Stage 5; reads DESIGN-VERIFICATION.md scoring rubric |
|
|
41
|
+
| design-planner | 0.08 | 0.10 (+25%) | 0.40 | 18000 | Stage 3; opus default |
|
|
42
|
+
| design-executor | 0.06 | 0.075 (+25%) | 0.50 | 15000 | Stage 4 |
|
|
43
|
+
| design-context-checker | 0.02 | 0.025 (+25%) | 0.65 | 6000 | Gate; pre-stage validator |
|
|
44
|
+
| design-reflector | 0.10 | 0.125 (+25%) | 0.35 | 22000 | XL reflector tier |
|
|
45
|
+
| design-discussant | 0.05 | 0.0625 (+25%) | 0.45 | 11000 | Spawned by `/gdd:discuss` |
|
|
46
|
+
| perf-analyzer | 0.10 | 0.125 (+25%) | 0.30 | 22000 | XL reflector tier (this phase) |
|
|
47
|
+
|
|
48
|
+
These values are **seed numbers**, re-calibrated after 1-2 real production cycles. The authoritative numbers live in `test-fixture/baselines/phase-27-6/perf-baseline.json` (created at Phase 27.6 closeout in Plan 27.6-06). The CI gate reads that file at runtime, not this table.
|
|
49
|
+
|
|
50
|
+
When the baseline JSON is **absent** (first run after this plan lands but before 27.6-06), the gate passes silently with a stderr notice — it does NOT block Wave A from shipping.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## CI Regression Gate
|
|
55
|
+
|
|
56
|
+
File: `tests/perf-budget.test.cjs`
|
|
57
|
+
|
|
58
|
+
Algorithm (single source of truth — re-uses `detectCostRegressions` from `scripts/lib/perf-analyzer/cost-regression.cjs`):
|
|
59
|
+
|
|
60
|
+
1. Load `test-fixture/baselines/phase-27-6/perf-baseline.json`. If absent, exit early — gate passes with stderr notice. (Phase 27.6-06 creates this file at closeout.)
|
|
61
|
+
2. Load `.design/telemetry/costs.jsonl` via `loadCosts`. If absent or empty, exit early — no data to regress against.
|
|
62
|
+
3. Read `perf_regression_threshold` from `.design/budget.json` (default 25 per D-01).
|
|
63
|
+
4. Call `detectCostRegressions({rows, baseline: parsedBaseline.agents, thresholdPct, cyclesRequired: 3})`.
|
|
64
|
+
5. If `result.regressions.length === 0`, gate passes.
|
|
65
|
+
6. Otherwise, fail the test with the regression details (agent, baseline_p50_usd, current_p50_usd, delta_pct, cycles_observed).
|
|
66
|
+
|
|
67
|
+
The gate is intentionally **low-noise**:
|
|
68
|
+
|
|
69
|
+
- Skips agents with fewer than 3 distinct cycles of data (avoids false positives during cold-start).
|
|
70
|
+
- Only fires on the **regression rule** — NOT on cache-hit-rate drops or p95 latency spikes; those surface as `agents/perf-analyzer.md` proposals only.
|
|
71
|
+
- Top-3 cap on the regressions list — a "noisy day" can flag at most three agents, never the entire roster.
|
|
72
|
+
|
|
73
|
+
The gate runs as a regular `node --test` entry under the `tests/**/*.test.cjs` glob — no special CI wiring required. If you can run `npm test`, you run the gate.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Tuning the Gate
|
|
78
|
+
|
|
79
|
+
Override the regression threshold by adding to `.design/budget.json`:
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"perf_regression_threshold": 30
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Override the cache-warming false-positive tolerance (used by Phase 27.6-03):
|
|
88
|
+
|
|
89
|
+
```json
|
|
90
|
+
{
|
|
91
|
+
"cache_warming_falsepositive_threshold": 25
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
**Defaults** (per Phase 27.6 D-01 + D-02):
|
|
96
|
+
|
|
97
|
+
- `perf_regression_threshold: 25`
|
|
98
|
+
- `cache_warming_falsepositive_threshold: 20`
|
|
99
|
+
|
|
100
|
+
After 5 measured cycles accumulate, re-tune based on observed natural variance. The 25%-default is conservative — likely too loose once real telemetry stabilizes. The first tightening pass belongs to a measurement-gated follow-up, not v1.27.6 itself.
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Recalibration (Phase 27.6 D-03 follow-up)
|
|
105
|
+
|
|
106
|
+
v1.27.6 ships with synthetic-cycle-replay baselines. After 1-2 real production cycles accumulate, re-lock the baseline:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
chore(27.6): recalibrate perf-budget against measured cycles
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
That commit:
|
|
113
|
+
|
|
114
|
+
1. Re-runs the baseline-fixture build against real telemetry.
|
|
115
|
+
2. Updates `test-fixture/baselines/phase-27-6/perf-baseline.json` with the measured p50, hit_rate, and p95_ms per agent.
|
|
116
|
+
3. Bumps the budget numbers in this document to match.
|
|
117
|
+
4. Optionally tightens `perf_regression_threshold` from 25 toward 15-20 if measured variance permits.
|
|
118
|
+
|
|
119
|
+
The synthetic baseline is **not a hack** — it's the documented v1 path per spec Success Criterion #7. Real-cycle data simply doesn't exist yet at v1.27.6 cut, because Phase 27.5 only shipped 2026-05-17.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Cross-references
|
|
124
|
+
|
|
125
|
+
- `agents/perf-analyzer.md` — cross-cycle reflector that reads the same baseline. Surfaces top-3 cost regressions, hit-rate deltas, and p95 spikes as `[REGRESSION]` proposals per `/gdd:reflect`.
|
|
126
|
+
- `scripts/lib/perf-analyzer/cost-regression.cjs` — **single source of truth** for the regression rule. The CI gate re-uses `detectCostRegressions` from this module; it does NOT re-implement the rule.
|
|
127
|
+
- `scripts/lib/perf-analyzer/index.cjs` — telemetry loader (`loadCosts`, `loadTrajectories`). JSONL-tolerant; blank lines silently ignored, malformed lines counted in `skipped_count`.
|
|
128
|
+
- `tests/perf-budget.test.cjs` — the CI gate itself. Always-green when no baseline + no data; fails on a regression at or above the threshold (default 25%) vs baseline once both exist.
|
|
129
|
+
- `reference/bandit-integration.md` — Phase 27.5 routing reference (precursor; the bandit picks tier **within** the budget — the gate evaluates whether the picked tier behaved within budget).
|
|
130
|
+
- `.design/budget.json` — operator-tunable thresholds. Optional file; absent file means defaults (`perf_regression_threshold: 25`, `cache_warming_falsepositive_threshold: 20`).
|
|
131
|
+
- `test-fixture/baselines/phase-27-6/perf-baseline.json` — authoritative per-agent p50 / hit_rate / p95_ms values. Created in Plan 27.6-06 closeout.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Boundary semantics (matching detectCostRegressions)
|
|
136
|
+
|
|
137
|
+
- **>= threshold** is a regression. A current p50 exactly 25% above baseline (e.g., baseline 0.05, current 0.0625) fires the gate. This matches the Phase 27.6-01 test contract.
|
|
138
|
+
- **base = 0 + current > 0** → flagged as `delta_pct: Infinity`. A previously-zero-cost agent becoming non-zero is always a regression.
|
|
139
|
+
- **base = 0 + current = 0** → NOT a regression (both values are zero, so `delta_pct = 0`).
|
|
140
|
+
- **Missing baseline entry** → agent silently skipped (no false positive on new agents that haven't been calibrated yet).
|
|
141
|
+
|
|
142
|
+
The gate's "fail loud, false-positive rare" character comes from these boundary choices plus the 3-cycle minimum — together they make the gate safe to wire into CI without flaking on first-run noise.
|
package/reference/registry.json
CHANGED
|
@@ -95,6 +95,13 @@
|
|
|
95
95
|
"type": "authority-feed",
|
|
96
96
|
"description": "Whitelist of design-authority feed sources for the watcher"
|
|
97
97
|
},
|
|
98
|
+
{
|
|
99
|
+
"name": "bandit-integration",
|
|
100
|
+
"path": "reference/bandit-integration.md",
|
|
101
|
+
"type": "meta-rules",
|
|
102
|
+
"phase": 27.5,
|
|
103
|
+
"description": "Phase 27.5 bandit production-integration cheat sheet — consultBandit + recordOutcome shim signatures, adaptive_mode gate semantics, reward function (Phase 23.5 D-08 unchanged), posterior path .design/telemetry/posterior.json, call-site map (budget-enforcer + session-runner + design-reflector Section 8)"
|
|
104
|
+
},
|
|
98
105
|
{
|
|
99
106
|
"name": "brand-voice",
|
|
100
107
|
"path": "reference/brand-voice.md",
|
|
@@ -652,6 +659,13 @@
|
|
|
652
659
|
"phase": 27,
|
|
653
660
|
"description": "Phase 27 peer-CLI delegation capability matrix — which peer (codex/copilot/cursor/gemini/qwen) claims which agent role, protocol (ACP/ASP), tie-break order, and opt-in gating semantics"
|
|
654
661
|
},
|
|
662
|
+
{
|
|
663
|
+
"name": "perf-budget",
|
|
664
|
+
"path": "reference/perf-budget.md",
|
|
665
|
+
"type": "meta-rules",
|
|
666
|
+
"phase": 27.6,
|
|
667
|
+
"description": "Phase 27.6 per-agent token-cost budget reference and CI regression-gate documentation; budgets sourced from current p50 + 25% buffer (D-05), CI gate fails on >25% regression vs baseline across 3 cycles (D-01); thresholds configurable via .design/budget.json"
|
|
668
|
+
},
|
|
655
669
|
{
|
|
656
670
|
"name": "performance",
|
|
657
671
|
"path": "reference/performance.md",
|