@hegemonart/get-design-done 1.59.4 → 1.59.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,14 +5,14 @@
5
5
  },
6
6
  "metadata": {
7
7
  "description": "Get Design Done — 5-stage agent-orchestrated design pipeline (Brief → Explore → Plan → Design → Verify) for AI coding agents. 64 agents, 95 skills, 39 connection integrations, two MCP servers, opt-in SQLite state backbone, bidirectional Figma write-back, and a reflector-driven self-improvement loop. Cross-runtime install for Claude Code, Codex, Cursor, OpenCode, Gemini, and more.",
8
- "version": "1.59.4"
8
+ "version": "1.59.5"
9
9
  },
10
10
  "plugins": [
11
11
  {
12
12
  "name": "get-design-done",
13
13
  "source": "./",
14
14
  "description": "Agent-orchestrated 5-stage design pipeline (Brief → Explore → Plan → Design → Verify) for AI coding agents. 64 specialized agents, 95 skills, 39 connection integrations (Figma, Refero, Preview, Storybook, Chromatic, Graphify, Linear, Jira, Notion, …), bidirectional Figma write-back, queryable intel store, opt-in SQLite state backbone, and a reflector-driven self-improvement loop. Two MCP servers (gdd-state for typed STATE mutators, gdd-mcp for 13 read-only project-priming tools), tier-aware routing with cost telemetry, and defense-in-depth hooks (protected paths, MCP circuit breaker, injection scanner, budget enforcer). Cross-runtime install for Claude Code, Codex, Cursor, OpenCode, Gemini, Copilot, and more.",
15
- "version": "1.59.4",
15
+ "version": "1.59.5",
16
16
  "author": {
17
17
  "name": "hegemonart"
18
18
  },
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "get-design-done",
3
3
  "short_name": "gdd",
4
- "version": "1.59.4",
4
+ "version": "1.59.5",
5
5
  "description": "Agent-orchestrated 5-stage design pipeline (Brief → Explore → Plan → Design → Verify) for AI coding agents. 64 specialized agents, 95 skills, 39 connection integrations (Figma, Refero, Preview, Storybook, Chromatic, Graphify, Linear, Jira, Notion, …), bidirectional Figma write-back, queryable intel store for O(1) design-surface lookups, opt-in SQLite state backbone, and a reflector-driven self-improvement loop. Two MCP servers (`gdd-state` for typed STATE mutators, `gdd-mcp` for 13 read-only project-priming tools), tier-aware agent routing with cost telemetry, defense-in-depth hooks (protected paths, MCP circuit breaker, injection scanner, budget enforcer), and a cross-runtime install layer for Claude Code, Codex, Cursor, OpenCode, Gemini, Copilot, and more.",
6
6
  "author": {
7
7
  "name": "hegemonart",
package/CHANGELOG.md CHANGED
@@ -4,6 +4,36 @@ All notable changes to get-design-done are documented here. Versions follow [sem
4
4
 
5
5
  ---
6
6
 
7
+ ## [1.59.5] - 2026-06-05
8
+
9
+ Fifth point release of the **v1.59 "Audit Closeout & Honesty Pass"** milestone. Batch H polish + runtime-model provenance.
10
+
11
+ ### Added
12
+
13
+ - **Risk-calibration now learns from the bandit feedback loop.** The bandit's post-spawn `recordOutcome` also updates the per-agent calibration table (best-effort), so calibration tracks the same signal that drives routing. (H2)
14
+ - **Runtime-model provenance guard.** `budget-enforcer` no longer applies a HARD per-spawn or per-phase budget cap from a BYOK / unverified runtime-model row (those caps degrade to advisory stderr warnings for that spawn; the project-level cap is still enforced), and the schema records the provenance. This closes the risk of an unverified placeholder tier hard-blocking a user. (P1)
15
+
16
+ ### Fixed
17
+
18
+ - **Cursor installs no longer drop co-located skill reference files.** `installMultiArtifact` now carries a skill's sibling `*.md` reference files (e.g. `*-procedure.md`) alongside `SKILL.md` for Cursor's flat layout, with symmetric uninstall cleanup. (H6)
19
+
20
+ ### Hardening
21
+
22
+ - Pinned the state backup-guard rotation cap (10 slots) + non-empty corruption check with a dedicated test, and verified `state-store.migrate()` is async with complete JSDoc. (H5/H7)
23
+
24
+ ### Notes
25
+
26
+ - Batch-H item H8 (the `composes_with` composition-graph backfill) is owned by Phase 58, not this release; no double-implementation here.
27
+ - Documented follow-ups (each a larger-than-a-slice scope cut): wiring calibration `detectDrift` into reflector consumption, SQLite-header corruption detection in the backup guard, generalizing the cursor sibling-carry to all flat-layout runtimes, and the structural `status:` key on runtime-model entries (currently blocked by the parser's allowed-key enforcement).
28
+
29
+ ### Breaking changes
30
+
31
+ None.
32
+
33
+ 5,070/5,070 tests pass.
34
+
35
+ ---
36
+
7
37
  ## [1.59.4] - 2026-06-04
8
38
 
9
39
  Fourth point release of the **v1.59 "Audit Closeout & Honesty Pass"** milestone. Skill-surface + build hygiene.
@@ -191,6 +191,21 @@ const tierResolver = nodeRequire(
191
191
  '../scripts/lib/tier-resolver.cjs',
192
192
  ) as TierResolverModule;
193
193
 
194
+ // Phase 59.5 P1: runtime-models parser for the BYOK/unverified provenance
195
+ // guard. We read the parsed runtime rows to learn a runtime's `status`
196
+ // ("verified" | "byok" | "unverified"). The parser is pure + never invoked
197
+ // for its model-resolution side here; only to classify the runtime so an
198
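+ // unverified row never drives a HARD budget cap. The parse call is guarded:
199
+ // any parser failure degrades to the built-in verified allowlist below. NOTE(review): the `nodeRequire` load itself is not wrapped in try/catch here; confirm a missing module cannot throw at import time.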
200
+ interface RuntimeModelsParserModule {
201
+ parseRuntimeModels(opts?: { cwd?: string }): {
202
+ runtimes: Array<{ id: string; status?: string }>;
203
+ };
204
+ }
205
+ const runtimeModelsParser = nodeRequire(
206
+ '../scripts/lib/install/parse-runtime-models.cjs',
207
+ ) as RuntimeModelsParserModule;
208
+
194
209
  // Plan 33.6-03 (SC#6, D-08, D-12): OpenRouter tier-resolver adapter. When the
195
210
  // user opts in (`.design/config.json#openrouter_enabled: true` OR
196
211
  // `OPENROUTER_API_KEY` present), the hook consults this adapter FIRST for a
@@ -506,6 +521,75 @@ export function loadBudget(): ResolvedBudget {
506
521
  }
507
522
  }
508
523
 
524
+ // ── runtime provenance status (Phase 59.5 P1) ───────────────────────────────
525
+
526
+ /**
527
+ * Phase 59.5 P1: provenance confidence of a runtime's tier→model row, as
528
+ * documented in reference/runtime-models.md and enumerated by
529
+ * reference/schemas/runtime-models.schema.json#status.
530
+ */
531
+ export type RuntimeStatus = 'verified' | 'byok' | 'unverified';
532
+
533
+ /**
534
+ * Built-in verified allowlist: the 4 runtimes whose tier maps are confirmed
535
+ * against runtime-author docs (the runtime-models.md banner: "4 of 14 ...
536
+ * verified (claude, codex, gemini, qwen)"). Used as the fallback classifier
537
+ * when the parsed row carries no structured `status` field yet (the markdown
538
+ * JSON blocks do not emit `status` at the time of this plan; the schema is
539
+ * ready, the parser wiring is a deferred follow-up). Once a row DOES carry
540
+ * `status`, the parsed value takes precedence over this allowlist.
541
+ */
542
+ const VERIFIED_RUNTIME_IDS: ReadonlySet<string> = new Set([
543
+ 'claude',
544
+ 'codex',
545
+ 'gemini',
546
+ 'qwen',
547
+ ]);
548
+
549
+ /** Per-process memo of runtime-id → parsed `status` (null until first read). */
550
+ let _runtimeStatusMap: Map<string, RuntimeStatus> | null = null;
551
+
552
+ function isRuntimeStatus(v: unknown): v is RuntimeStatus {
553
+ return v === 'verified' || v === 'byok' || v === 'unverified';
554
+ }
555
+
556
+ /**
557
+ * Resolve a runtime's provenance status. Reads the parsed runtime-models
558
+ * doc once per process; if a row carries a structured `status` it wins,
559
+ * otherwise the built-in verified allowlist decides (verified vs unverified).
560
+ * Fail-open: any parser error → allowlist-only classification. Never throws.
561
+ *
562
+ * @param runtimeId runtime id (e.g. 'claude', 'cline'); falsy → 'unverified'.
563
+ */
564
+ export function runtimeStatus(runtimeId: string | null | undefined): RuntimeStatus {
565
+ if (typeof runtimeId !== 'string' || runtimeId.length === 0) {
566
+ return 'unverified';
567
+ }
568
+ if (_runtimeStatusMap === null) {
569
+ _runtimeStatusMap = new Map();
570
+ try {
571
+ const parsed = runtimeModelsParser.parseRuntimeModels({ cwd: process.cwd() });
572
+ const rows = Array.isArray(parsed?.runtimes) ? parsed.runtimes : [];
573
+ for (const row of rows) {
574
+ if (row && typeof row.id === 'string' && isRuntimeStatus(row.status)) {
575
+ _runtimeStatusMap.set(row.id, row.status);
576
+ }
577
+ }
578
+ } catch {
579
+ // Fail open: parser error degrades to the verified allowlist below.
580
+ }
581
+ }
582
+ const parsedStatus = _runtimeStatusMap.get(runtimeId);
583
+ if (parsedStatus !== undefined) return parsedStatus;
584
+ return VERIFIED_RUNTIME_IDS.has(runtimeId) ? 'verified' : 'unverified';
585
+ }
586
+
587
+ /** True when the runtime row must NOT drive a HARD budget cap (P1 guard). */
588
+ export function isUnverifiedRuntime(runtimeId: string | null | undefined): boolean {
589
+ const s = runtimeStatus(runtimeId);
590
+ return s === 'byok' || s === 'unverified';
591
+ }
592
+
509
593
  // ── cumulative phase spend (WR-02) ──────────────────────────────────────────
510
594
 
511
595
  /**
@@ -1138,7 +1222,40 @@ export async function main(): Promise<void> {
1138
1222
  // no router decision is supplied, behavior is identical to pre-25.
1139
1223
  const perSpawnCap = resolvePerSpawnCap(budget, complexityClass);
1140
1224
 
1141
- if (budget.enforcement_mode === 'enforce') {
1225
+ // ── Phase 59.5 P1: BYOK/unverified provenance guard ────────────────────────
1226
+ //
1227
+ // Resolve the runtime id (router-supplied `runtime`, else env detection,
1228
+ // else 'claude', same precedence the cost-recording block uses below) so we
1229
+ // can consult its runtime-models provenance `status` BEFORE the hard-cap
1230
+ // branches. When the runtime row is byok/unverified the resolved per-runtime
1231
+ // model is best-effort (the user's actual provider may diverge from the
1232
+ // Anthropic-default fill), so an estimated cost computed against it must NOT
1233
+ // hard-block the user. We degrade enforce-mode to advisory ('warn') for THIS
1234
+ // spawn only: the per-spawn + per-phase 100% caps stop blocking and surface a
1235
+ // stderr warning instead, while the 80% auto-downgrade still applies (a tier
1236
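+ // downgrade is non-blocking and strictly cheaper, so it is safe to keep). NOTE(review): the visible `_tier_override = 'haiku'` downgrade sits at the end of the `effectiveEnforcementMode === 'enforce'` branch, which a degraded spawn skips; confirm the 80% downgrade really still runs.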
1237
+ // Verified runtimes (claude/codex/gemini/qwen) are unaffected (full hard
1238
+ // enforcement). The project-level cap above is intentionally NOT degraded: it
1239
+ // is governed by total ledger spend, not a per-runtime resolved model.
1240
+ const guardRuntimeId =
1241
+ (typeof routerDecision?.runtime === 'string' && routerDecision.runtime.length > 0
1242
+ ? routerDecision.runtime
1243
+ : runtimeDetect.detect()) ?? 'claude';
1244
+ const runtimeIsUnverified = isUnverifiedRuntime(guardRuntimeId);
1245
+ const effectiveEnforcementMode: ResolvedBudget['enforcement_mode'] =
1246
+ budget.enforcement_mode === 'enforce' && runtimeIsUnverified
1247
+ ? 'warn'
1248
+ : budget.enforcement_mode;
1249
+ if (budget.enforcement_mode === 'enforce' && runtimeIsUnverified) {
1250
+ process.stderr.write(
1251
+ `gdd-budget-enforcer WARN: runtime '${guardRuntimeId}' has provenance status ` +
1252
+ `'${runtimeStatus(guardRuntimeId)}' (BYOK/unverified tier→model row); ` +
1253
+ `hard budget caps degraded to advisory for this spawn so an unverified ` +
1254
+ `cost estimate never hard-blocks you.\n`,
1255
+ );
1256
+ }
1257
+
1258
+ if (effectiveEnforcementMode === 'enforce') {
1142
1259
  // Branch C: 100% per-spawn cap hard block (class-specific or per_task).
1143
1260
  if (estCost >= perSpawnCap) {
1144
1261
  writeTelemetry({
@@ -1202,12 +1319,24 @@ export async function main(): Promise<void> {
1202
1319
  toolInput._tier_override = 'haiku';
1203
1320
  toolInput._tier_downgraded = true;
1204
1321
  }
1205
- } else if (budget.enforcement_mode === 'warn') {
1322
+ } else if (effectiveEnforcementMode === 'warn') {
1206
1323
  if (estCost >= perSpawnCap) {
1207
1324
  process.stderr.write(
1208
1325
  `gdd-budget-enforcer WARN: per-spawn cap will be exceeded ($${estCost.toFixed(4)} >= $${perSpawnCap})\n`,
1209
1326
  );
1210
1327
  }
1328
+ // Phase 59.5 P1: when enforce was degraded to advisory for a byok/unverified
1329
+ // runtime, also surface the per-phase breach that the hard branch above
1330
+ // would otherwise have reported (it is skipped for unverified runtimes).
1331
+ if (
1332
+ budget.enforcement_mode === 'enforce' &&
1333
+ phaseSpend + estCost >= budget.per_phase_cap_usd
1334
+ ) {
1335
+ process.stderr.write(
1336
+ `gdd-budget-enforcer WARN: per-phase cap will be exceeded for ${phase} ` +
1337
+ `($${(phaseSpend + estCost).toFixed(4)} >= $${budget.per_phase_cap_usd.toFixed(2)})\n`,
1338
+ );
1339
+ }
1211
1340
  }
1212
1341
  // enforcement_mode === 'log': telemetry only.
1213
1342
 
@@ -1230,11 +1359,9 @@ export async function main(): Promise<void> {
1230
1359
  toolInput._tier_override ?? toolInput._default_tier ?? 'sonnet';
1231
1360
  // Runtime tag: prefer the router's explicit `runtime` (D-08) field;
1232
1361
  // fall back to env-var detection; default to 'claude' since the .ts
1233
- // hook itself only runs inside Claude Code.
1234
- const runtimeId =
1235
- (typeof routerDecision?.runtime === 'string' && routerDecision.runtime.length > 0
1236
- ? routerDecision.runtime
1237
- : runtimeDetect.detect()) ?? 'claude';
1362
+ // hook itself only runs inside Claude Code. Reuse the id already resolved
1363
+ // for the Phase 59.5 P1 provenance guard above (single resolution source).
1364
+ const runtimeId = guardRuntimeId;
1238
1365
 
1239
1366
  // ── Plan 27.5-02 — bandit consultation ────────────────────────────────────
1240
1367
  //
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hegemonart/get-design-done",
3
- "version": "1.59.4",
3
+ "version": "1.59.5",
4
4
  "description": "A design-quality pipeline for AI coding agents: brief, explore, plan, design, and verify UI work against your design system.",
5
5
  "author": "Hegemon",
6
6
  "homepage": "https://github.com/hegemonart/get-design-done",
@@ -10,7 +10,7 @@ Single canonical map from Anthropic tier names (`opus|sonnet|haiku`) and runtime
10
10
  >
11
11
  > Unverified: kilo, copilot, cursor, windsurf, antigravity, augment, trae, codebuddy, cline, opencode.
12
12
  >
13
- > The schema (`reference/schemas/runtime-models.schema.json`) explicitly accepts the placeholder marker so the file ships shape-valid; the unverified-ness is a content gap, not a structural defect.
13
+ > **Provenance status field.** Each row below is annotated with its provenance `status` in its section heading: `verified` (confirmed against runtime-author docs), `byok` (BYOK / multi-provider, where the user-configured model may diverge from the Anthropic-default fill), or `unverified` (placeholder fill pending researcher confirmation). The schema (`reference/schemas/runtime-models.schema.json`) accepts an optional `status` enum of exactly these three values, so verified rows MAY omit it and remain shape-valid. The `hooks/budget-enforcer.ts` guard consults a row's structured `status` when the parser emits one and otherwise falls back to its built-in verified allowlist (claude, codex, gemini, qwen), so a `byok`/`unverified` row never drives a HARD budget cap: it degrades to advisory enforcement for that spawn. The heading annotation is for human readers; the JSON blocks do not yet carry a structured `status` key, so until that parser follow-up lands the allowlist is what makes the unverified-ness machine-readable, not just a content gap in prose.
14
14
 
15
15
  This file is parsed by `scripts/lib/install/parse-runtime-models.cjs` and consumed by:
16
16
 
@@ -36,7 +36,7 @@ This file is parsed by `scripts/lib/install/parse-runtime-models.cjs` and consum
36
36
 
37
37
  ---
38
38
 
39
- ## claude - Claude Code
39
+ ## claude - Claude Code (status: verified)
40
40
 
41
41
  Anthropic's first-party runtime. Public tier docs at https://docs.anthropic.com/en/docs/about-claude/models. Seed picks per CONTEXT.md D-02.
42
42
 
@@ -66,7 +66,7 @@ Anthropic's first-party runtime. Public tier docs at https://docs.anthropic.com/
66
66
 
67
67
  ---
68
68
 
69
- ## codex - OpenAI Codex CLI
69
+ ## codex - OpenAI Codex CLI (status: verified)
70
70
 
71
71
  OpenAI's Codex CLI runtime. Public tier docs at https://platform.openai.com/docs/models. Seed picks per CONTEXT.md D-02.
72
72
 
@@ -96,7 +96,7 @@ OpenAI's Codex CLI runtime. Public tier docs at https://platform.openai.com/docs
96
96
 
97
97
  ---
98
98
 
99
- ## gemini - Gemini CLI
99
+ ## gemini - Gemini CLI (status: verified)
100
100
 
101
101
  Google's Gemini CLI runtime. Public tier docs at https://ai.google.dev/gemini-api/docs/models. Seed picks per CONTEXT.md D-02.
102
102
 
@@ -126,7 +126,7 @@ Google's Gemini CLI runtime. Public tier docs at https://ai.google.dev/gemini-ap
126
126
 
127
127
  ---
128
128
 
129
- ## qwen - Qwen Code
129
+ ## qwen - Qwen Code (status: verified)
130
130
 
131
131
  Alibaba's Qwen Code runtime. Public tier docs at https://github.com/QwenLM/qwen-code. Seed picks per CONTEXT.md D-02.
132
132
 
@@ -156,7 +156,7 @@ Alibaba's Qwen Code runtime. Public tier docs at https://github.com/QwenLM/qwen-
156
156
 
157
157
  ---
158
158
 
159
- ## kilo - Kilo Code
159
+ ## kilo - Kilo Code (status: byok)
160
160
 
161
161
  Kilo Code adapter - multi-provider, Anthropic-default fill until runtime-author docs confirm. Researcher fill needed (CONTEXT.md D-02).
162
162
 
@@ -186,7 +186,7 @@ Kilo Code adapter - multi-provider, Anthropic-default fill until runtime-author
186
186
 
187
187
  ---
188
188
 
189
- ## copilot - GitHub Copilot CLI
189
+ ## copilot - GitHub Copilot CLI (status: byok)
190
190
 
191
191
  GitHub Copilot CLI - multi-provider routing under the hood. Researcher fill needed (CONTEXT.md D-02).
192
192
 
@@ -216,7 +216,7 @@ GitHub Copilot CLI - multi-provider routing under the hood. Researcher fill need
216
216
 
217
217
  ---
218
218
 
219
- ## cursor - Cursor
219
+ ## cursor - Cursor (status: byok)
220
220
 
221
221
  Cursor IDE/CLI - multi-provider routing. Researcher fill needed (CONTEXT.md D-02).
222
222
 
@@ -246,7 +246,7 @@ Cursor IDE/CLI - multi-provider routing. Researcher fill needed (CONTEXT.md D-02
246
246
 
247
247
  ---
248
248
 
249
- ## windsurf - Windsurf
249
+ ## windsurf - Windsurf (status: byok)
250
250
 
251
251
  Windsurf (formerly Codeium) - multi-provider Cascade router. Researcher fill needed (CONTEXT.md D-02).
252
252
 
@@ -276,7 +276,7 @@ Windsurf (formerly Codeium) - multi-provider Cascade router. Researcher fill nee
276
276
 
277
277
  ---
278
278
 
279
- ## antigravity - Antigravity
279
+ ## antigravity - Antigravity (status: unverified)
280
280
 
281
281
  Antigravity - Google's agentic coding platform. Researcher fill needed (CONTEXT.md D-02).
282
282
 
@@ -306,7 +306,7 @@ Antigravity - Google's agentic coding platform. Researcher fill needed (CONTEXT.
306
306
 
307
307
  ---
308
308
 
309
- ## augment - Augment
309
+ ## augment - Augment (status: byok)
310
310
 
311
311
  Augment Code - multi-provider agentic IDE. Researcher fill needed (CONTEXT.md D-02).
312
312
 
@@ -336,7 +336,7 @@ Augment Code - multi-provider agentic IDE. Researcher fill needed (CONTEXT.md D-
336
336
 
337
337
  ---
338
338
 
339
- ## trae - Trae
339
+ ## trae - Trae (status: unverified)
340
340
 
341
341
  Trae - single-model session runtime per CONTEXT.md D-02 example. `single_tier: true` annotates the row. Researcher fill needed.
342
342
 
@@ -367,7 +367,7 @@ Trae - single-model session runtime per CONTEXT.md D-02 example. `single_tier: t
367
367
 
368
368
  ---
369
369
 
370
- ## codebuddy - CodeBuddy
370
+ ## codebuddy - CodeBuddy (status: byok)
371
371
 
372
372
  CodeBuddy (Tencent) - multi-provider routing. Researcher fill needed (CONTEXT.md D-02).
373
373
 
@@ -397,7 +397,7 @@ CodeBuddy (Tencent) - multi-provider routing. Researcher fill needed (CONTEXT.md
397
397
 
398
398
  ---
399
399
 
400
- ## cline - Cline
400
+ ## cline - Cline (status: byok)
401
401
 
402
402
  Cline (formerly Claude Dev) - multi-provider VS Code agent. Researcher fill needed (CONTEXT.md D-02).
403
403
 
@@ -427,7 +427,7 @@ Cline (formerly Claude Dev) - multi-provider VS Code agent. Researcher fill need
427
427
 
428
428
  ---
429
429
 
430
- ## opencode - OpenCode
430
+ ## opencode - OpenCode (status: byok)
431
431
 
432
432
  OpenCode - open-source AI coding agent, BYOK multi-provider. Researcher fill needed (CONTEXT.md D-02).
433
433
 
@@ -1042,6 +1042,10 @@ export interface RuntimeEntry {
1042
1042
  * When true, the runtime exposes a single model that maps to all three tiers (D-02). Downstream consumers (router, budget-enforcer) may render a UI affordance noting tier-selection has no cost effect for this runtime.
1043
1043
  */
1044
1044
  single_tier?: boolean;
1045
+ /**
1046
+ * Provenance confidence of this runtime's tier map. 'verified' = confirmed against runtime-author docs (claude, codex, gemini, qwen). 'byok' = BYOK / multi-provider runtime whose user-configured model may diverge from the Anthropic-default fill. 'unverified' = placeholder fill pending researcher confirmation. Optional: when a row omits this field, consumers fall back to their own classification (the budget-enforcer uses its built-in verified allowlist and treats every other runtime as unverified), so verified rows MAY omit it. The budget-enforcer guard reads this (or its built-in verified allowlist) so a byok/unverified row never drives a HARD budget cap (degrades to advisory).
1047
+ */
1048
+ status?: 'verified' | 'byok' | 'unverified';
1045
1049
  /**
1046
1050
  * Map of canonical Anthropic tier names (D-03) to the runtime's concrete model identifier. All three keys are required even when single_tier=true (assign the same model three times).
1047
1051
  */
@@ -47,6 +47,11 @@
47
47
  "type": "boolean",
48
48
  "description": "When true, the runtime exposes a single model that maps to all three tiers (D-02). Downstream consumers (router, budget-enforcer) may render a UI affordance noting tier-selection has no cost effect for this runtime."
49
49
  },
50
+ "status": {
51
+ "type": "string",
52
+ "enum": ["verified", "byok", "unverified"],
53
+ "description": "Provenance confidence of this runtime's tier map. 'verified' = confirmed against runtime-author docs (claude, codex, gemini, qwen). 'byok' = BYOK / multi-provider runtime whose user-configured model may diverge from the Anthropic-default fill. 'unverified' = placeholder fill pending researcher confirmation. Optional: when a row omits this field, consumers fall back to their own classification (the budget-enforcer uses its built-in verified allowlist and treats every other runtime as unverified), so verified rows MAY omit it. The budget-enforcer guard reads this (or its built-in verified allowlist) so a byok/unverified row never drives a HARD budget cap (degrades to advisory)."
54
+ },
50
55
  "tier_to_model": {
51
56
  "type": "object",
52
57
  "additionalProperties": false,
@@ -35,6 +35,12 @@
35
35
 
36
36
  const banditRouter = require('../bandit-router.cjs');
37
37
  const adaptiveModeLib = require('../adaptive-mode.cjs');
38
+ // Phase 56 (CAL-01) per-agent risk calibration. recordOutcome feeds the same
39
+ // {agent, status} signal it gives the bandit into this table so calibration
40
+ // learns from the post-spawn outcome too. Failure-tolerant: the call is wrapped in
41
+ // its own best-effort try/catch (D-04) so a calibration write can never break
42
+ // the bandit path.
43
+ const calibration = require('../risk/calibration.cjs');
38
44
 
39
45
  const DELEGATE_NONE = banditRouter.DELEGATE_NONE; // 'none'
40
46
  const VALID_DELEGATES = banditRouter.DEFAULT_DELEGATES; // ['none','gemini','codex','cursor','copilot','qwen']
@@ -299,6 +305,38 @@ function recordOutcome(input) {
299
305
  }
300
306
  }
301
307
 
308
+ // CAL-01: also fold the same outcome into the per-agent risk calibration
309
+ // table so the calibration layer (compute-risk feedback) learns from the
310
+ // identical post-spawn signal the bandit just saw. Independent best-effort
311
+ // try/catch (D-04): a calibration write failure must NEVER throw into or
312
+ // break the bandit path above. The bandit signal carries no emitted risk
313
+ // score, so `risk` degrades to 0 via normalizeRecord; status drives the
314
+ // correctness axis (completed → applied-correct, anything else → not-correct).
315
+ // Writes to calibration.DEFAULT_CALIBRATION_PATH ('.design/telemetry/
316
+ // calibration.json') under baseDir — the module's own canonical location.
317
+ try {
318
+ calibration.updateCalibration(
319
+ input.agent,
320
+ {
321
+ accepted: true,
322
+ post_apply_correct: input.status === 'completed',
323
+ },
324
+ { root: input.baseDir, baseDir: input.baseDir },
325
+ );
326
+ } catch (err) {
327
+ if (process.env.GDD_BANDIT_DEBUG === '1') {
328
+ try {
329
+ process.stderr.write(
330
+ '[bandit-integration] recordOutcome calibration swallowed: ' +
331
+ (err && err.message ? err.message : String(err)) +
332
+ '\n',
333
+ );
334
+ } catch {
335
+ /* swallow */
336
+ }
337
+ }
338
+ }
339
+
302
340
  return undefined;
303
341
  }
304
342
 
@@ -334,6 +334,58 @@ function listSourceSkills(skillsRoot) {
334
334
  });
335
335
  }
336
336
 
337
+ /**
338
+ * Enumerate co-located sibling `*.md` reference files for a skill.
339
+ *
340
+ * A skill source directory may ship reference files next to SKILL.md
341
+ * (e.g. `<name>-procedure.md`, `<name>-rules.md`, `cache-policy.md`).
342
+ * SKILL.md references these via relative links; if they are not installed
343
+ * the links resolve to nothing. This returns the top-level sibling `.md`
344
+ * files only (NOT SKILL.md itself, NOT files in nested subdirectories).
345
+ *
346
+ * Best-effort: any fs error yields an empty list (never throws). A single
347
+ * unreadable skill dir must not crash the whole install.
348
+ *
349
+ * @param {string} skillSrcDir absolute path to `<skillsRoot>/<name>`
350
+ * @returns {string[]} basenames of sibling `.md` files (excluding SKILL.md)
351
+ */
352
+ function listSiblingRefFiles(skillSrcDir) {
353
+ let entries;
354
+ try {
355
+ entries = fs.readdirSync(skillSrcDir, { withFileTypes: true });
356
+ } catch {
357
+ return [];
358
+ }
359
+ return entries
360
+ .filter((ent) => {
361
+ if (!ent.isFile()) return false;
362
+ if (ent.name === 'SKILL.md') return false;
363
+ return ent.name.toLowerCase().endsWith('.md');
364
+ })
365
+ .map((ent) => ent.name);
366
+ }
367
+
368
+ /**
369
+ * Wrap a passthrough sibling reference file's content with a plugin
370
+ * fingerprint header so foreign-file protection + uninstall can recognize
371
+ * it as plugin-owned. Idempotent: re-wrapping a file that already carries
372
+ * the fingerprint returns it unchanged.
373
+ *
374
+ * The fingerprint matches `merge.cjs#GDD_ADAPTER_FINGERPRINT`, the same
375
+ * marker every SKILL converter injects via `shared.ensureAdapterHeader`,
376
+ * so `isPluginOwned` treats the sibling as owned.
377
+ *
378
+ * @param {string} raw source sibling file content
379
+ * @returns {string}
380
+ */
381
+ function fingerprintSiblingRef(raw) {
382
+ const text = typeof raw === 'string' ? raw : '';
383
+ if (isPluginOwned(text)) return text;
384
+ const header =
385
+ '<!-- gdd: auto-generated from Claude SKILL.md. Reference adapter -->\n\n';
386
+ return header + text;
387
+ }
388
+
337
389
  /**
338
390
  * Install all artifacts for a `multi-artifact` runtime.
339
391
  *
@@ -395,6 +447,48 @@ function installMultiArtifact(runtime, configDir, dryRun, opts) {
395
447
  action: writeResult.action,
396
448
  ...(writeResult.reason ? { reason: writeResult.reason } : {}),
397
449
  });
450
+
451
+ // Batch H6: carry co-located sibling `*.md` reference files alongside
452
+ // SKILL.md. The skills layout only stages SKILL.md per skill, so
453
+ // reference siblings (e.g. `<name>-procedure.md`) are otherwise lost.
454
+ // Scoped to cursor (the audited flat-layout runtime); other runtimes
455
+ // keep their prior single-SKILL.md behavior. Siblings are passthrough
456
+ // copies fingerprinted so foreign-file protection + uninstall treat
457
+ // them as plugin-owned. Broader skillsKind-runtime carry is deferred
458
+ // (see converters/cursor.cjs KNOWN LIMITATION).
459
+ if (kind.kind === 'skills' && runtime.id === 'cursor' && item.srcPath) {
460
+ const skillSrcDir = path.dirname(item.srcPath);
461
+ const skillDestDir = path.dirname(destPath);
462
+ for (const sibling of listSiblingRefFiles(skillSrcDir)) {
463
+ let rawSibling;
464
+ try {
465
+ rawSibling = fs.readFileSync(
466
+ path.join(skillSrcDir, sibling),
467
+ 'utf8',
468
+ );
469
+ } catch (err) {
470
+ perFile.push({
471
+ kind: 'skill-ref',
472
+ path: path.join(skillDestDir, sibling),
473
+ action: 'skipped-foreign',
474
+ reason: `Could not read sibling ${sibling}: ${err.message}`,
475
+ });
476
+ continue;
477
+ }
478
+ const siblingDest = path.join(skillDestDir, sibling);
479
+ const siblingWrite = writeFingerprinted(
480
+ siblingDest,
481
+ fingerprintSiblingRef(rawSibling),
482
+ dryRun,
483
+ );
484
+ perFile.push({
485
+ kind: 'skill-ref',
486
+ path: siblingDest,
487
+ action: siblingWrite.action,
488
+ ...(siblingWrite.reason ? { reason: siblingWrite.reason } : {}),
489
+ });
490
+ }
491
+ }
398
492
  }
399
493
  }
400
494
 
@@ -489,7 +583,45 @@ function uninstallMultiArtifact(runtime, configDir, dryRun, opts) {
489
583
 
490
584
  // If we removed a SKILL.md, remember to trim its now-empty parent.
491
585
  if (kind.kind === 'skills') {
492
- skillDirsToTrim.push(path.dirname(destPath));
586
+ const skillDestDir = path.dirname(destPath);
587
+ skillDirsToTrim.push(skillDestDir);
588
+
589
+ // Batch H6: symmetric cleanup for the sibling reference files the
590
+ // cursor install carries alongside SKILL.md. Remove only the
591
+ // plugin-owned siblings so a now-empty dir can be trimmed below;
592
+ // user-authored siblings are left in place (foreign-file discipline).
593
+ if (runtime.id === 'cursor') {
594
+ for (const sibling of listSiblingRefFiles(skillDestDir)) {
595
+ const siblingPath = path.join(skillDestDir, sibling);
596
+ let siblingContent;
597
+ try {
598
+ siblingContent = fs.readFileSync(siblingPath, 'utf8');
599
+ } catch (err) {
600
+ perFile.push({
601
+ kind: 'skill-ref',
602
+ path: siblingPath,
603
+ action: 'skipped-foreign',
604
+ reason: `Could not read sibling ${sibling}: ${err.message}`,
605
+ });
606
+ continue;
607
+ }
608
+ if (!isPluginOwned(siblingContent)) {
609
+ perFile.push({
610
+ kind: 'skill-ref',
611
+ path: siblingPath,
612
+ action: 'skipped-foreign',
613
+ reason: `Existing ${sibling} was not authored by this plugin; not removing.`,
614
+ });
615
+ continue;
616
+ }
617
+ if (!dryRun) fs.unlinkSync(siblingPath);
618
+ perFile.push({
619
+ kind: 'skill-ref',
620
+ path: siblingPath,
621
+ action: 'removed',
622
+ });
623
+ }
624
+ }
493
625
  }
494
626
  }
495
627
  }