@hegemonart/get-design-done 1.54.0 → 1.56.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +2 -2
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +92 -0
- package/README.md +6 -0
- package/SKILL.md +1 -0
- package/agents/design-fixer.md +16 -0
- package/bin/gdd-dashboard +91 -0
- package/dist/claude-code/.claude/skills/override/SKILL.md +86 -0
- package/hooks/gdd-decision-injector.js +58 -0
- package/hooks/gdd-fact-force.js +345 -0
- package/hooks/gdd-risk-gate.js +406 -0
- package/hooks/hooks.json +18 -0
- package/package.json +2 -1
- package/reference/schemas/events.schema.json +61 -1
- package/reference/skill-graph.md +2 -1
- package/scripts/lib/dashboard/graph-html.cjs +0 -0
- package/scripts/lib/health-mirror/index.cjs +146 -1
- package/scripts/lib/manifest/skills.json +8 -0
- package/scripts/lib/risk/calibration.cjs +385 -0
- package/scripts/lib/risk/compute-risk.cjs +229 -0
- package/scripts/lib/risk/consumers.cjs +211 -0
- package/scripts/lib/risk/override.cjs +87 -0
- package/scripts/lib/risk/route.cjs +59 -0
- package/scripts/lib/risk/tables.cjs +221 -0
- package/sdk/cli/commands/dashboard.ts +419 -0
- package/sdk/cli/index.js +253 -2
- package/sdk/cli/index.ts +7 -0
- package/sdk/dashboard/data/_pkg-root.cjs +92 -0
- package/sdk/dashboard/data/cost-aggregator.cjs +187 -0
- package/sdk/dashboard/data/discovery.cjs +297 -0
- package/sdk/dashboard/data/risk-surface.cjs +136 -0
- package/sdk/dashboard/data/source.cjs +576 -0
- package/sdk/dashboard/tui/ansi.cjs +355 -0
- package/sdk/dashboard/tui/index.cjs +778 -0
- package/sdk/mcp/gdd-mcp/server.js +70 -0
- package/skills/override/SKILL.md +86 -0
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
// Surface:
|
|
9
9
|
// async getHealthChecks(rootDir) → { checks: HealthCheck[] }
|
|
10
10
|
//
|
|
11
|
-
// The
|
|
11
|
+
// The 10 checks (in stable order) are:
|
|
12
12
|
// 1. claude_md — CLAUDE.md presence
|
|
13
13
|
// 2. planning_dir — .planning/ presence
|
|
14
14
|
// 3. design_dir — .design/ presence
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
// 7. skill_discipline — using-gdd bootstrap + SessionStart inject (Plan 32-07)
|
|
19
19
|
// 8. harness_freshness — per-harness last_verified age (Phase 44)
|
|
20
20
|
// 9. stack_addendums — Phase 54 coverage: N/M detected stacks have addendums
|
|
21
|
+
// 10. dashboard_reachable — Phase 55: bin/gdd-dashboard on disk + data plane loads
|
|
21
22
|
//
|
|
22
23
|
// Check 5 was added in Plan 30-06 — surfaces the report-issue kill-switch
|
|
23
24
|
// (env or config disable) so users can verify why the command is
|
|
@@ -48,6 +49,22 @@
|
|
|
48
49
|
// inject-using-gdd entry)
|
|
49
50
|
// status: 'ok' when ready, 'warn' otherwise. PURE read-only (rootDir-relative
|
|
50
51
|
// file + JSON inspection only) — NEVER throws, NEVER networks.
|
|
52
|
+
//
|
|
53
|
+
// Check 10 was added in Phase 55 — surfaces whether the GDD dashboard is
|
|
54
|
+
// reachable so a user running /gdd:health knows the `gdd dashboard` entrypoint
|
|
55
|
+
// is wired. GRACEFUL-ABSENT by design (D-8 risk surfacing precedent): the
|
|
56
|
+
// dashboard is an opt-in, read-only surface that also works via file-scrape, so
|
|
57
|
+
// a missing bin or absent data plane is a 'warn' (actionable note), NEVER a
|
|
58
|
+
// hard 'fail'. The status is 'ok' when BOTH the bin/gdd-dashboard trampoline
|
|
59
|
+
// resolves on disk (located via a package-root walk-up — the Phase 53/54 lesson,
|
|
60
|
+
// NEVER a fixed __dirname jump) AND the dashboard data plane module
|
|
61
|
+
// (sdk/dashboard/data/source.cjs) loads + exposes loadDashboardModel. The detail
|
|
62
|
+
// line is one of:
|
|
63
|
+
// - "dashboard: bin/gdd-dashboard present; data plane ok"
|
|
64
|
+
// - "dashboard: bin missing" (trampoline not on disk)
|
|
65
|
+
// - "dashboard: data plane unavailable" (bin present, source.cjs absent)
|
|
66
|
+
// - "dashboard: bin missing; data plane unavailable"
|
|
67
|
+
// PURE read-only (fs.statSync + a wrapped require) — NEVER throws, NEVER networks.
|
|
51
68
|
|
|
52
69
|
const fs = require('node:fs');
|
|
53
70
|
const path = require('node:path');
|
|
@@ -310,6 +327,39 @@ async function getHealthChecks(rootDir) {
|
|
|
310
327
|
checks.push({ name: 'stack_addendums', status, detail });
|
|
311
328
|
}
|
|
312
329
|
|
|
330
|
+
// 10. dashboard_reachable — Phase 55. GRACEFUL-ABSENT: reports whether the
|
|
331
|
+
// GDD dashboard entrypoint is wired (bin/gdd-dashboard on disk) AND its data
|
|
332
|
+
// plane module loads. NEVER 'fail' — a missing bin is a 'warn' note because
|
|
333
|
+
// the dashboard is opt-in and also works via file-scrape. PURE read-only
|
|
334
|
+
// (fs.statSync + a wrapped require); NEVER throws, NEVER networks.
|
|
335
|
+
{
|
|
336
|
+
let status;
|
|
337
|
+
let detail;
|
|
338
|
+
try {
|
|
339
|
+
const gddRoot = resolveDashboardRoot(rootDir);
|
|
340
|
+
const binPresent = dashboardBinResolves(gddRoot);
|
|
341
|
+
const dataPlaneOk = dashboardDataPlaneLoads(gddRoot);
|
|
342
|
+
if (binPresent && dataPlaneOk) {
|
|
343
|
+
status = 'ok';
|
|
344
|
+
detail = 'dashboard: bin/gdd-dashboard present; data plane ok';
|
|
345
|
+
} else {
|
|
346
|
+
status = 'warn';
|
|
347
|
+
if (!binPresent && !dataPlaneOk) {
|
|
348
|
+
detail = 'dashboard: bin missing; data plane unavailable';
|
|
349
|
+
} else if (!binPresent) {
|
|
350
|
+
detail = 'dashboard: bin missing';
|
|
351
|
+
} else {
|
|
352
|
+
detail = 'dashboard: data plane unavailable';
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
} catch {
|
|
356
|
+
// Absolute safety net — the health probe must never crash on this check.
|
|
357
|
+
status = 'warn';
|
|
358
|
+
detail = 'dashboard: unavailable';
|
|
359
|
+
}
|
|
360
|
+
checks.push({ name: 'dashboard_reachable', status, detail });
|
|
361
|
+
}
|
|
362
|
+
|
|
313
363
|
return { checks };
|
|
314
364
|
}
|
|
315
365
|
|
|
@@ -402,4 +452,99 @@ function figmaVariablesBlockedLocally(rootDir) {
|
|
|
402
452
|
}
|
|
403
453
|
}
|
|
404
454
|
|
|
455
|
+
/**
 * Climb from `startDir` toward the filesystem root looking for the GDD
 * package root: the first directory whose package.json parses and whose
 * `name` is either the bare "get-design-done" or any scoped name ending in
 * "/get-design-done" (e.g. "@hegemonart/get-design-done").
 *
 * The climb is bounded to 12 directory levels and stops early at the
 * filesystem root. Levels with a missing or unparsable package.json are
 * skipped, not treated as errors. Never throws: any unexpected failure
 * (including a non-string `startDir`) yields null.
 *
 * @param {string} startDir directory to start climbing from
 * @returns {string|null} absolute package-root dir, or null when no marker is found
 */
function findGddPackageRoot(startDir) {
  const MAX_LEVELS = 12;
  const isGddName = (name) =>
    typeof name === 'string' && (name === 'get-design-done' || /\/get-design-done$/.test(name));

  try {
    let current = path.resolve(startDir);
    let level = 0;
    while (level < MAX_LEVELS) {
      let manifest = null;
      try {
        manifest = JSON.parse(fs.readFileSync(path.join(current, 'package.json'), 'utf8'));
      } catch {
        // No package.json here, or it is not valid JSON — keep climbing.
      }
      if (manifest && isGddName(manifest.name)) {
        return current;
      }
      const up = path.dirname(current);
      if (up === current) {
        // dirname() is a fixed point only at the filesystem root.
        return null;
      }
      current = up;
      level += 1;
    }
    return null;
  } catch {
    return null;
  }
}
|
|
490
|
+
|
|
491
|
+
/**
 * Pick the GDD package root the dashboard probe should inspect for the
 * project being health-checked.
 *
 * Resolution order (both steps are walk-ups via findGddPackageRoot):
 *   1. From `rootDir` — when the project itself sits inside a GDD checkout
 *      (dev / self-host / a fixture declaring the GDD name), that root wins
 *      and no other root is consulted.
 *   2. From this module's own `__dirname` — used when `rootDir` carries no
 *      GDD marker (an unrelated consumer project).
 *
 * @param {string} rootDir project root passed to getHealthChecks
 * @returns {string|null} the resolved GDD root, or null when neither walk-up finds one
 */
function resolveDashboardRoot(rootDir) {
  return findGddPackageRoot(rootDir) || findGddPackageRoot(__dirname);
}
|
|
511
|
+
|
|
512
|
+
/**
 * Report whether `<gddRoot>/bin/gdd-dashboard` exists as a regular file.
 * (Phase 55, check 10.) `fs.statSync` follows symlinks, so a symlink that
 * resolves to a file counts as present. Read-only; never throws — a falsy
 * root or any stat failure yields false.
 *
 * @param {string} gddRoot authoritative GDD root (or null)
 * @returns {boolean} true iff bin/gdd-dashboard is present on disk as a file
 */
function dashboardBinResolves(gddRoot) {
  let present = false;
  if (gddRoot) {
    try {
      const trampoline = path.join(gddRoot, 'bin', 'gdd-dashboard');
      present = fs.statSync(trampoline).isFile();
    } catch {
      // Missing path, permission error, bad argument — all read as "absent".
      present = false;
    }
  }
  return present;
}
|
|
529
|
+
|
|
530
|
+
/**
 * Report whether the dashboard data plane module,
 * `<gddRoot>/sdk/dashboard/data/source.cjs`, can be required and exposes a
 * `loadDashboardModel` function. (Phase 55, check 10.)
 *
 * Never throws: a falsy root, a missing module, or any error raised while
 * requiring or inspecting it degrades to false (which the caller reports as
 * a 'warn').
 *
 * @param {string} gddRoot authoritative GDD root (or null)
 * @returns {boolean} true iff source.cjs loads and exports loadDashboardModel
 */
function dashboardDataPlaneLoads(gddRoot) {
  if (!gddRoot) {
    return false;
  }
  try {
    const sourcePath = path.join(gddRoot, 'sdk', 'dashboard', 'data', 'source.cjs');
    const plane = require(sourcePath);
    if (!plane) {
      return false;
    }
    return typeof plane.loadDashboardModel === 'function';
  } catch {
    return false;
  }
}
|
|
549
|
+
|
|
405
550
|
module.exports = { getHealthChecks };
|
|
@@ -319,6 +319,14 @@
|
|
|
319
319
|
"user_invocable": true,
|
|
320
320
|
"tools": "Read, Bash, Grep, Write"
|
|
321
321
|
},
|
|
322
|
+
{
|
|
323
|
+
"name": "override",
|
|
324
|
+
"description": "Escalation surface for a risk-blocked action or a fact-force gate. Use when the Phase 56 risk gate blocked a writer action (suggested_action=block) and a reviewer has signed off, or when the first-write fact-force gate is holding a file you have legitimately reviewed. Activates for requests involving overriding a blocked edit, approving a high-risk change, or clearing a fact-force hold on a path.",
|
|
325
|
+
"argument_hint": "<finding-id | factforce <path>> [--approver <who>] [--reason <text>]",
|
|
326
|
+
"user_invocable": true,
|
|
327
|
+
"tools": "Read, Write, Bash, Grep, Glob",
|
|
328
|
+
"registered_in_phase": "56"
|
|
329
|
+
},
|
|
322
330
|
{
|
|
323
331
|
"name": "pause",
|
|
324
332
|
"description": "Write a numbered checkpoint so work can resume in a new session without re-running completed stages.",
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* scripts/lib/risk/calibration.cjs — Phase 56 (CAL-01) per-agent risk
|
|
4
|
+
* calibration + the bandit reward bridge.
|
|
5
|
+
*
|
|
6
|
+
* The risk scorer (scripts/lib/risk/compute-risk.cjs) is a STATIC table-driven
|
|
7
|
+
* model — it cannot learn that a particular writer agent chronically under- or
|
|
8
|
+
* over-scores its own actions. This module is the feedback layer: it records
|
|
9
|
+
* per-agent outcomes in a rolling-50 window, derives three calibration
|
|
10
|
+
* statistics, flags drift, and feeds a reward signal into the Phase 23.5
|
|
11
|
+
* Thompson-sampling bandit (scripts/lib/bandit-router.cjs) so the adaptive
|
|
12
|
+
* router can react to a mis-calibrated agent over time.
|
|
13
|
+
*
|
|
14
|
+
* Persistence:
|
|
15
|
+
* .design/telemetry/calibration.json
|
|
16
|
+
* {
|
|
17
|
+
* schema_version: '56.0',
|
|
18
|
+
* generated_at: ISO,
|
|
19
|
+
* agents: {
|
|
20
|
+
* "<agent>": {
|
|
21
|
+
* window: [ { risk, accepted, user_undo, post_apply_correct }, … ≤50 ],
|
|
22
|
+
* mean_risk_emitted: number, // mean(window.risk)
|
|
23
|
+
* override_rate: number, // P(rejected OR undone)
|
|
24
|
+
* post_apply_correctness: number // P(correct | applied)
|
|
25
|
+
* }, …
|
|
26
|
+
* }
|
|
27
|
+
* }
|
|
28
|
+
* Atomic .tmp + rename (mirrors instinct-store.save / ds-arms.save). The
|
|
29
|
+
* `.design/` tree is gitignored + worktree-local (R5).
|
|
30
|
+
*
|
|
31
|
+
* Purity contract:
|
|
32
|
+
* - detectDrift + riskReward are PURE (no I/O, no Date.now / Math.random;
|
|
33
|
+
* the DRIFT thresholds are frozen). Deterministic for the suite.
|
|
34
|
+
* - updateCalibration reads/writes the FS, but only via the injected
|
|
35
|
+
* `{root}` (or `file`) so tests run hermetically under a tmpdir. The only
|
|
36
|
+
* non-determinism is `generated_at` (an ISO stamp), which callers can pin
|
|
37
|
+
* via opts.now.
|
|
38
|
+
* - recordRiskOutcome calls bandit-router.update BEST-EFFORT — it never
|
|
39
|
+
* throws (a telemetry write must never break a hook / agent turn).
|
|
40
|
+
*
|
|
41
|
+
* Zero new dependency. CommonJS to match the scripts/lib/ siblings.
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
const fs = require('node:fs');
|
|
45
|
+
const path = require('node:path');
|
|
46
|
+
|
|
47
|
+
const SCHEMA_VERSION = '56.0';
|
|
48
|
+
const DEFAULT_CALIBRATION_PATH = '.design/telemetry/calibration.json';
|
|
49
|
+
|
|
50
|
+
/** Rolling window length (CAL-01): keep the last 50 outcomes per agent. */
|
|
51
|
+
const WINDOW_SIZE = 50;
|
|
52
|
+
|
|
53
|
+
/**
 * Frozen drift thresholds consumed by detectDrift.
 *
 *   under_scoring — low mean emitted risk combined with a high override
 *                   rate: the scores are too tame.
 *   over_scoring  — high mean emitted risk while applied actions are almost
 *                   always correct and overrides are rare: the scores are
 *                   too alarmist.
 *
 * Both the outer table and each band are frozen, so the defaults cannot be
 * mutated at runtime.
 */
const UNDER_SCORING_BAND = Object.freeze({
  mean_risk_max: 0.35,
  override_rate_min: 0.30,
});
const OVER_SCORING_BAND = Object.freeze({
  mean_risk_min: 0.65,
  correctness_min: 0.90,
  override_rate_max: 0.10,
});
const DRIFT = Object.freeze({
  under_scoring: UNDER_SCORING_BAND,
  over_scoring: OVER_SCORING_BAND,
});
|
|
70
|
+
|
|
71
|
+
/**
 * Clamp a value to the closed interval [0, 1].
 *
 * Any non-number or non-finite input (NaN, +Infinity, -Infinity) maps to 0,
 * as the documented contract requires. The previous implementation only
 * rejected NaN, so +Infinity slipped through and saturated to 1.
 * NOTE(review): the contract is described as matching compute-risk.clamp01;
 * that sibling is not visible from this file — confirm the two agree.
 *
 * @param {number} n candidate value
 * @returns {number} n limited to [0, 1], or 0 when n is not a finite number
 */
function clamp01(n) {
  // Number.isFinite does not coerce: strings, null, NaN and ±Infinity all fail.
  if (!Number.isFinite(n)) return 0;
  if (n < 0) return 0;
  if (n > 1) return 1;
  return n;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Work out where the calibration file lives on disk.
 *
 *   - `opts.file` absolute  -> used verbatim.
 *   - `opts.file` relative  -> resolved under the base directory.
 *   - no `opts.file`        -> DEFAULT_CALIBRATION_PATH resolved under the
 *                              base directory.
 *
 * The base directory is `opts.root`, else `opts.baseDir`, else
 * `process.cwd()`; it is only evaluated when a relative path must be resolved.
 *
 * @param {{file?:string, root?:string, baseDir?:string}} [opts]
 * @returns {{file:string, dir:string}} absolute file path and its parent directory
 */
function paths(opts = {}) {
  const resolveUnderBase = (relative) =>
    path.resolve(opts.root ?? opts.baseDir ?? process.cwd(), relative);

  const explicit = opts.file;
  let file;
  if (!explicit) {
    file = resolveUnderBase(DEFAULT_CALIBRATION_PATH);
  } else if (path.isAbsolute(explicit)) {
    file = explicit;
  } else {
    file = resolveUnderBase(explicit);
  }
  return { file, dir: path.dirname(file) };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Load the calibration store from disk, falling back to a fresh envelope
 * (`{ schema_version, agents: {} }`) when the file is absent, unreadable,
 * not valid JSON, or not shaped like a store.
 *
 * "Shaped like a store" means the top-level value AND its `agents` member are
 * non-null, non-array objects. Arrays are rejected on purpose: `typeof []` is
 * 'object', and string-keyed properties set on an array are dropped by
 * JSON.stringify, so accepting `"agents": []` would make every later write
 * silently persist nothing.
 *
 * Never throws for file-content problems; path resolution is delegated to
 * paths(opts).
 *
 * @param {{file?:string, root?:string, baseDir?:string}} [opts]
 * @returns {{schema_version:string, generated_at?:string, agents:object}}
 */
function load(opts = {}) {
  const { file } = paths(opts);
  const freshStore = () => ({ schema_version: SCHEMA_VERSION, agents: {} });
  const isPlainRecord = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);

  if (!fs.existsSync(file)) {
    return freshStore();
  }
  try {
    const data = JSON.parse(fs.readFileSync(file, 'utf8'));
    if (!isPlainRecord(data) || !isPlainRecord(data.agents)) {
      return freshStore();
    }
    return data;
  } catch {
    // Unreadable or corrupt file: start over rather than break the caller.
    return freshStore();
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Write the calibration store to disk via a sibling `.tmp` file followed by
 * a rename, creating the parent directory first when needed.
 *
 * Side effects on `store` (the caller's object is stamped in place):
 *   - `schema_version` is set to SCHEMA_VERSION.
 *   - `generated_at` is set to `opts.now` (a Date is converted with
 *     toISOString; a string is used verbatim) or, when `opts.now` is neither,
 *     to the current time.
 *
 * The file content is the store as 2-space-indented JSON plus a trailing
 * newline. Filesystem errors propagate to the caller.
 *
 * @param {object} store
 * @param {{file?:string, root?:string, baseDir?:string, now?:string|Date}} [opts]
 * @returns {string} absolute path written
 */
function save(store, opts = {}) {
  const target = paths(opts);
  fs.mkdirSync(target.dir, { recursive: true });

  store.schema_version = SCHEMA_VERSION;

  let stamp;
  if (opts.now instanceof Date) {
    stamp = opts.now.toISOString();
  } else if (typeof opts.now === 'string') {
    stamp = opts.now;
  } else {
    stamp = new Date().toISOString();
  }
  store.generated_at = stamp;

  // Stage next to the destination so the final rename stays on one filesystem.
  const staging = `${target.file}.tmp`;
  const payload = `${JSON.stringify(store, null, 2)}\n`;
  fs.writeFileSync(staging, payload);
  fs.renameSync(staging, target.file);
  return target.file;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Turn an arbitrary outcome into the canonical window record. Anything that
 * is not an object is treated as an empty record. Field rules:
 *
 *   risk               -> clamp01 of the value when it is a number, else 0
 *   accepted           -> true when undefined (no explicit reject means the
 *                         action is treated as applied), otherwise coerced
 *                         to boolean
 *   user_undo          -> coerced to boolean (missing -> false)
 *   post_apply_correct -> null when undefined or null ("not yet known"),
 *                         otherwise coerced to boolean
 *
 * @param {object} record
 * @returns {{risk:number, accepted:boolean, user_undo:boolean, post_apply_correct:(boolean|null)}}
 */
function normalizeRecord(record) {
  const source = record && typeof record === 'object' ? record : {};
  const { risk, accepted, user_undo: undone, post_apply_correct: verdict } = source;

  return {
    risk: clamp01(typeof risk === 'number' ? risk : 0),
    accepted: accepted === undefined ? true : Boolean(accepted),
    user_undo: Boolean(undone),
    // `== null` deliberately matches both null and undefined.
    post_apply_correct: verdict == null ? null : Boolean(verdict),
  };
}
|
|
170
|
+
|
|
171
|
+
/**
 * Derive the three rolling statistics from a window of normalized records.
 *
 *   mean_risk_emitted      = sum(risk) / count
 *   override_rate          = share of records that were rejected
 *                            (!accepted) or undone (user_undo)
 *   post_apply_correctness = among applied records (accepted and not undone)
 *                            whose post_apply_correct is non-null, the share
 *                            that is exactly `true`; 1 when there are none
 *
 * A non-array or empty window yields { 0, 0, 1 }.
 *
 * @param {Array} window normalized records
 * @returns {{mean_risk_emitted:number, override_rate:number, post_apply_correctness:number}}
 */
function computeStats(window) {
  const records = Array.isArray(window) ? window : [];
  const count = records.length;
  if (count === 0) {
    return { mean_risk_emitted: 0, override_rate: 0, post_apply_correctness: 1 };
  }

  const tally = { risk: 0, overridden: 0, resolved: 0, correct: 0 };
  for (let idx = 0; idx < count; idx += 1) {
    const entry = records[idx];
    tally.risk += entry.risk;

    // A record is either applied (kept by the user) or overridden — never both.
    const applied = entry.accepted && !entry.user_undo;
    if (!applied) {
      tally.overridden += 1;
      continue;
    }
    // Unresolved (null) correctness signals are left out of the rate.
    if (entry.post_apply_correct === null) continue;
    tally.resolved += 1;
    if (entry.post_apply_correct === true) tally.correct += 1;
  }

  const correctness = tally.resolved === 0 ? 1 : tally.correct / tally.resolved;
  return {
    mean_risk_emitted: tally.risk / count,
    override_rate: tally.overridden / count,
    post_apply_correctness: correctness,
  };
}
|
|
213
|
+
|
|
214
|
+
/**
 * Append one risk outcome to `agent`'s rolling window and persist the result.
 *
 * Steps: load the store, re-normalize the agent's existing window, append the
 * normalized `record`, keep only the newest WINDOW_SIZE entries, recompute the
 * three statistics with computeStats, replace the agent's entry, and save.
 *
 * @param {string} agent the writer agent the assessment scored (e.g. 'design-fixer')
 * @param {{risk?:number, accepted?:boolean, user_undo?:boolean, post_apply_correct?:boolean}} record
 * @param {{file?:string, root?:string, baseDir?:string, now?:string|Date}} [opts]
 * @returns {{agent:string, stats:{mean_risk_emitted:number, override_rate:number, post_apply_correctness:number}, windowSize:number, path:string}}
 * @throws {TypeError} when `agent` is not a non-empty string
 */
function updateCalibration(agent, record, opts = {}) {
  const hasAgent = typeof agent === 'string' && agent.length > 0;
  if (!hasAgent) {
    throw new TypeError('updateCalibration: agent (non-empty string) required');
  }

  const store = load(opts);
  if (!store.agents || typeof store.agents !== 'object') {
    store.agents = {};
  }

  // Re-normalize what is already on disk so older or hand-edited entries conform.
  const existing = store.agents[agent];
  const history =
    existing && Array.isArray(existing.window) ? existing.window.map(normalizeRecord) : [];
  history.push(normalizeRecord(record));

  // Trim from the front so only the newest WINDOW_SIZE outcomes remain.
  const overflow = history.length - WINDOW_SIZE;
  const window = overflow > 0 ? history.slice(overflow) : history;

  const stats = computeStats(window);
  const { mean_risk_emitted, override_rate, post_apply_correctness } = stats;
  store.agents[agent] = {
    window,
    mean_risk_emitted,
    override_rate,
    post_apply_correctness,
  };

  const writtenPath = save(store, opts);
  return { agent, stats, windowSize: window.length, path: writtenPath };
}
|
|
250
|
+
|
|
251
|
+
/**
 * Classify calibration drift from an agent's rolling statistics. Performs no
 * I/O and reads no clock or random source.
 *
 *   'under_scoring' when mean_risk_emitted <  under.mean_risk_max
 *                    and override_rate     >  under.override_rate_min
 *   'over_scoring'  when mean_risk_emitted >  over.mean_risk_min
 *                    and post_apply_correctness > over.correctness_min
 *                    and override_rate     <  over.override_rate_max
 *   'none'          otherwise
 *
 * All comparisons are strict. under_scoring is evaluated first. Any statistic
 * that is missing or not a number is read as 0. A band missing from `cfg`
 * falls back to the corresponding DRIFT band.
 *
 * @param {{mean_risk_emitted?:number, override_rate?:number, post_apply_correctness?:number}} stats
 * @param {object} [cfg] defaults to the frozen DRIFT thresholds
 * @returns {'under_scoring'|'over_scoring'|'none'}
 */
function detectDrift(stats, cfg = DRIFT) {
  const numberOrZero = (value) => (typeof value === 'number' ? value : 0);

  const source = stats && typeof stats === 'object' ? stats : {};
  const meanRisk = numberOrZero(source.mean_risk_emitted);
  const overrideRate = numberOrZero(source.override_rate);
  const correctness = numberOrZero(source.post_apply_correctness);

  const underBand = (cfg && cfg.under_scoring) || DRIFT.under_scoring;
  const overBand = (cfg && cfg.over_scoring) || DRIFT.over_scoring;

  const tooTame =
    meanRisk < underBand.mean_risk_max && overrideRate > underBand.override_rate_min;
  if (tooTame) {
    return 'under_scoring';
  }

  const tooAlarmist =
    meanRisk > overBand.mean_risk_min &&
    correctness > overBand.correctness_min &&
    overrideRate < overBand.override_rate_max;
  return tooAlarmist ? 'over_scoring' : 'none';
}
|
|
288
|
+
|
|
289
|
+
/**
 * Convert one risk outcome into a bandit reward in [0, 1]. Performs no I/O.
 *
 *   - explicitly rejected (accepted === false) or undone (user_undo === true)
 *     -> 0, whatever the risk was;
 *   - otherwise -> clamp01(1 - 0.5 * risk), where risk is first clamped to
 *     [0, 1] and a non-number risk counts as 0. The reward therefore falls
 *     linearly from 1.0 (risk 0) to 0.5 (risk 1).
 *
 * Examples:
 *   {accepted:true,  risk:0.2}                 -> 0.9
 *   {accepted:false, risk:0.2}                 -> 0
 *   {accepted:true,  risk:0.9}                 -> 0.55
 *   {accepted:true,  risk:0.0, user_undo:true} -> 0
 *
 * @param {{accepted?:boolean, risk?:number, user_undo?:boolean}} input
 * @returns {number} reward in [0, 1]
 */
function riskReward(input) {
  const outcome = input && typeof input === 'object' ? input : {};

  // Only the exact values `false` / `true` count as a veto — no coercion.
  const vetoed = outcome.accepted === false || outcome.user_undo === true;
  if (vetoed) {
    return 0;
  }

  const carriedRisk = clamp01(typeof outcome.risk === 'number' ? outcome.risk : 0);
  return clamp01(1 - 0.5 * carriedRisk);
}
|
|
317
|
+
|
|
318
|
+
/**
 * Best-effort bridge from a risk outcome to the bandit: compute the reward
 * with riskReward and hand it to the bandit's `update()`.
 *
 * The bandit write is skipped — and reported via `reason` — when `agent`,
 * `bin` or `tier` is not a non-empty string; no arm is guessed. The bandit
 * is `input.bandit` when supplied, otherwise `../bandit-router.cjs`,
 * required lazily so a load failure is caught like any other error.
 *
 * Every failure inside the bridge is swallowed and returned as
 * `{ recorded:false, reward, reason }`, so callers are not interrupted by a
 * failed learning write.
 *
 * @param {{
 *   agent: string,
 *   bin?: string,
 *   tier?: string,
 *   accepted?: boolean,
 *   risk?: number,
 *   user_undo?: boolean,
 *   bandit?: object,
 *   root?: string, baseDir?: string, posteriorPath?: string,
 * }} input
 * @returns {{recorded:boolean, reward:number, reason?:string}}
 */
function recordRiskOutcome(input) {
  const reward = riskReward(input || {});
  const skipped = (reason) => ({ recorded: false, reward, reason });
  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;

  try {
    const ctx = input && typeof input === 'object' ? input : {};

    if (!isNonEmptyString(ctx.agent)) {
      return skipped('agent required for bandit update');
    }
    if (!isNonEmptyString(ctx.bin) || !isNonEmptyString(ctx.tier)) {
      // No routing context means no addressable arm — skip rather than guess.
      return skipped('bin+tier required for bandit update');
    }

    const router = ctx.bandit || require('../bandit-router.cjs');
    const { agent, bin, tier, posteriorPath } = ctx;
    router.update({
      agent,
      bin,
      tier,
      reward,
      baseDir: ctx.baseDir ?? ctx.root,
      posteriorPath,
    });
    return { recorded: true, reward };
  } catch (err) {
    return skipped(err && err.message ? err.message : String(err));
  }
}
|
|
369
|
+
|
|
370
|
+
module.exports = {
|
|
371
|
+
updateCalibration,
|
|
372
|
+
detectDrift,
|
|
373
|
+
riskReward,
|
|
374
|
+
recordRiskOutcome,
|
|
375
|
+
// Exposed for tests + sibling reuse.
|
|
376
|
+
computeStats,
|
|
377
|
+
normalizeRecord,
|
|
378
|
+
load,
|
|
379
|
+
save,
|
|
380
|
+
clamp01,
|
|
381
|
+
DRIFT,
|
|
382
|
+
WINDOW_SIZE,
|
|
383
|
+
SCHEMA_VERSION,
|
|
384
|
+
DEFAULT_CALIBRATION_PATH,
|
|
385
|
+
};
|