@hegemonart/get-design-done 1.32.0 → 1.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,14 +5,14 @@
5
5
  },
6
6
  "metadata": {
7
7
  "description": "Get Design Done — 5-stage agent-orchestrated design pipeline with 9 connections, handoff-first workflow, bidirectional Figma write-back, 22+ specialized agents, queryable knowledge layer (intel store, dependency analysis, learnings extraction), and a self-improvement loop (reflector, frontmatter + budget feedback, global-skills layer). v1.20.0 ships the SDK foundation: gdd-state MCP server (11 typed tools), lockfile-safe STATE.md mutations, event stream, and resilience primitives (jittered-backoff, rate-guard, error-classifier, iteration-budget) for rate-limit + 429 + context-overflow recovery. Full CI/CD pipeline (Node 22/24 × Linux/macOS/Windows) and release automation (auto-tag + GitHub Release + release-time smoke test).",
8
- "version": "1.32.0"
8
+ "version": "1.33.0"
9
9
  },
10
10
  "plugins": [
11
11
  {
12
12
  "name": "get-design-done",
13
13
  "source": "./",
14
14
  "description": "Agent-orchestrated 5-stage design pipeline: Brief → Explore → Plan → Design → Verify. 22+ specialized agents, 9 connections (Figma, Refero, Preview, Storybook, Chromatic, Figma Writer, Graphify, Pinterest, Claude Design), Claude Design handoff, bidirectional Figma write-back, and a queryable intel store (.design/intel/) for dependency and learnings queries. Standalone commands: style, darkmode, compare, figma-write, graphify, handoff, analyze-dependencies, skill-manifest, extract-learnings. Embeds NNG heuristics, WCAG thresholds, typographic systems, motion framework, and anti-pattern catalog. Ships with a full CI/CD pipeline (Node 22/24 × Linux/macOS/Windows) and release automation. Optimization layer (v1.0.4.1, retroactive): gdd-router + gdd-cache-manager skills, PreToolUse budget-enforcer hook, tier-aware agent frontmatter, lazy checker gates, streaming synthesizer, /gdd:warm-cache + /gdd:optimize commands, and cost telemetry at .design/telemetry/costs.jsonl — targeting 50-70% per-task token-cost reduction with no quality-floor regression. v1.20.0 SDK foundation: gdd-state MCP server (11 typed tools), lockfile-safe STATE.md mutations, event stream at .design/telemetry/events.jsonl, resilience primitives (jittered-backoff, rate-guard, error-classifier, iteration-budget) with rate-limit + 429 + context-overflow recovery, and TypeScript toolchain.",
15
- "version": "1.32.0",
15
+ "version": "1.33.0",
16
16
  "author": {
17
17
  "name": "hegemonart"
18
18
  },
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "get-design-done",
3
3
  "short_name": "gdd",
4
- "version": "1.32.0",
4
+ "version": "1.33.0",
5
5
  "description": "Agent-orchestrated 5-stage design pipeline: Brief → Explore → Plan → Design → Verify. 22+ specialized agents, 9 connections (Figma, Refero, Preview, Storybook, Chromatic, Figma Writer, Graphify, Pinterest, Claude Design), handoff-first workflow via Claude Design bundles, bidirectional Figma write-back (annotations, Code Connect), queryable intel store (`.design/intel/`) for O(1) design surface lookups, and self-improvement loop (reflector agent, frontmatter + budget feedback, global-skills layer at `~/.claude/gdd/global-skills/`). Standalone commands: style, darkmode, compare, figma-write, graphify, handoff, analyze-dependencies, skill-manifest, extract-learnings, reflect, apply-reflections. Embeds NNG heuristics, WCAG thresholds, typographic systems, motion framework, and anti-pattern catalog. Ships with a full CI/CD pipeline (Node 22/24 × Linux/macOS/Windows, lint + schema + frontmatter + stale-ref + shellcheck + gitleaks + injection-scan + blocking size-budget) and release automation (auto-tag + GitHub Release + release-time smoke test). Optimization layer (v1.0.4.1, retroactive): gdd-router + gdd-cache-manager skills, PreToolUse budget-enforcer hook, tier-aware agent frontmatter, lazy checker gates, streaming synthesizer, /gdd:warm-cache + /gdd:optimize commands, and cost telemetry at .design/telemetry/costs.jsonl — targeting 50-70% per-task token-cost reduction with no quality-floor regression. v1.20.0 SDK foundation: gdd-state MCP server (11 typed tools), lockfile-safe STATE.md mutations, event stream at .design/telemetry/events.jsonl, resilience primitives (jittered-backoff, rate-guard, error-classifier, iteration-budget) with rate-limit + 429 + context-overflow recovery, and TypeScript toolchain. v1.27.7 ships gdd-mcp (Phase 27.7): 12 read-only MCP tools for sub-3s priming. 
v1.28.0 (Phase 28): Foundational References Tier 2 — 5 new reference files (color-theory, composition, proportion-systems, i18n, contrast-advanced), 2 verifier i18n probes + 1 explore i18n-readiness probe, 12 additive cross-link insertions across 10 existing references, 2 orthogonal audit-scoring lens-tags (composition_alignment + i18n_readiness).",
6
6
  "author": {
7
7
  "name": "hegemonart",
package/CHANGELOG.md CHANGED
@@ -4,6 +4,38 @@ All notable changes to get-design-done are documented here. Versions follow [sem
4
4
 
5
5
  ---
6
6
 
7
+ ## [1.33.0] - 2026-05-30
8
+
9
+ ### Phase 33 — Skill Behavior Tests (Pressure-Scenario Harness)
10
+
11
+ Adds a **behavior-test category** that complements the static validators (Phase 28.5 line/frontmatter) and static guardrails (Phase 32 `<HARD-GATE>` presence) with tests that verify skills hold UNDER PRESSURE. A manifest-driven runner drives a pressure scenario (time / sunk-cost / authority / exhaustion / scope-minimization) through an injectable agent-invoker and validates the response against a compliance/violation rubric with N-attempts + majority rule. Ships the harness + 8 baseline scenarios + synthetic RED baselines + the description-format A/B methodology + reflector telemetry integration. Ports the TDD-for-skills methodology + the pressure-scenario pattern from [`obra/superpowers/skills/writing-skills`](https://github.com/obra/superpowers) (MIT). 6 plans across Waves A–C.
12
+
13
+ ### Added
14
+
15
+ - **Manifest-driven pressure-scenario runner** — `scripts/lib/skill-behavior/runner.cjs` exposes an INJECTABLE `invokeAgent(prompt, opts) -> { text }` seam (no `@anthropic-ai/sdk` dependency — D-03): a deterministic STUB invoker (`scripts/lib/skill-behavior/stub-invoker.cjs`) for CI/tests, plus a documented real-invoker adapter for the opt-in keyed run. Runs each scenario N times and decides compliance by majority.
16
+ - **Pressure-scenario schema** — `reference/schemas/pressure-scenario.schema.json` (wired into `validate:schemas`), with conformance tests for the 8 scenario manifests.
17
+ - **8 pressure scenarios + synthetic RED baselines** — `test/suite/skill-behavior/scenarios/` (7 stage skills + `using-gdd`) with synthetic-from-observed-cycle-drift RED baselines at `test/fixtures/skill-behavior-baseline/` (D-02 — ROADMAP-sanctioned).
18
+ - **Description-format A/B methodology** — `docs/research/description-format-ab.md` documents the trigger-only vs `<what>. Use when` counterfactual + the 7/10-run threshold (D-08), with a `pending: keyed run` marker. The empirical result is an opt-in maintainer follow-up (no API key in CI).
19
+ - **Reflector telemetry** — `scripts/lib/skill-behavior/telemetry.cjs` emits to `.design/telemetry/skill-behavior.jsonl`; a sustained-failure signal (≥3 of last 10 runs failing for a scenario) feeds an `apply-reflections` proposal (stub-tested integration — D-07).
20
+ - **`npm run test:behavior` (opt-in, D-06).** A new script that runs the behavior tests ONLY when `ANTHROPIC_API_KEY` is set (a clear skip message + exit 0 otherwise). The default `npm test` is UNCHANGED — the structural stub tests stay CI-green (LLM non-determinism keeps live behavior runs out of the default suite).
21
+ - **Docs** — `CONTRIBUTING.md` gains a "How to add a pressure scenario" section + the keyed `ANTHROPIC_API_KEY=… npm run test:behavior` procedure; `README.md` gains a "Skill behavior tests" subsection.
22
+
23
+ ### Removed
24
+
25
+ - **BREAKING: the Phase-31.5 deprecation shims are removed (D-04).** The 10 `GDD-DEPRECATION-SHIM` re-exports re-created at the OLD SDK paths in v1.31.5 — `scripts/lib/{cli,event-stream,gdd-state,gdd-errors}/index.ts`, `scripts/lib/{error-classifier,iteration-budget,jittered-backoff,lockfile}.cjs`, and `scripts/mcp-servers/{gdd-state,gdd-mcp}/server.ts` — are deleted. The grace window elapsed (v1.31.5 shipped with shims → v1.32.0 still had them → v1.33.0 removes them). The now-empty `scripts/mcp-servers/` is dropped from the `package.json` `files` allowlist. **If you imported `scripts/lib/…` or `scripts/mcp-servers/…` directly, import from `sdk/…` instead** (e.g. `scripts/lib/cli` → `sdk/cli`, `scripts/lib/error-classifier.cjs` → `sdk/primitives/error-classifier.cjs`, `scripts/mcp-servers/gdd-state/server.ts` → `sdk/mcp/gdd-state/server.ts`). Internal callers were all repointed to `sdk/` in 31.5 + the Phase-32 gdd-events fix; the `gdd-state-mcp` / `gdd-mcp` bins target `sdk/`, so deletion drops only the external re-export — proven by the `no-stale-internal-refs` guard + the full suite + the 31.5 headless pack→install→run E2E.
26
+
27
+ ### Attribution
28
+
29
+ - **Methodology + pattern ported from [`obra/superpowers/skills/writing-skills`](https://github.com/obra/superpowers) (MIT).** The TDD-for-skills cycle (RED: agent fails without the skill → GREEN: skill counters the rationalizations → REFACTOR: close new loopholes) and the pressure-scenario pattern. See `NOTICE`. We port the methodology, not the content — GDD's scenarios, rubrics, and skills are GDD-specific.
30
+
31
+ ### Notes
32
+
33
+ - The behavioral evidence (real RED baselines from live agent runs + the empirical A/B result) is NOT capturable autonomously (no API key / SDK in CI). RED baselines are authored synthetic-from-observed-cycle-drift (D-02); the A/B evidence file documents methodology + expected-signal + a `pending: keyed run` marker. A Phase-28.5 feedback note points at `docs/research/description-format-ab.md`; **Phase 28.5's description-format validator regex is unchanged** (33-06 emits the pointer only — D-08).
34
+ - The 31.5 tarball golden (`test/fixtures/baselines/phase-31-5/tarball-manifest.txt`) was regenerated as a reviewed delta: **+4** skill-behavior paths (`reference/schemas/pressure-scenario.schema.json` + the 3 `scripts/lib/skill-behavior/*.cjs`) and **−10** removed shim paths (618 paths).
35
+ - 6-manifest lockstep at **v1.33.0** (`package.json` + `package-lock.json` + `.claude-plugin/plugin.json` + `.claude-plugin/marketplace.json` (metadata.version + plugins[0].version) + `.cursor-plugin/plugin.json` + `.codex-plugin/plugin.json`). Version-sync hygiene done upfront (D-09): `OFF_CADENCE_VERSIONS.add('1.33.0')` + prior `manifests-version.txt` baselines forward-propagated 1.32.0 → 1.33.0.
36
+
37
+ ---
38
+
7
39
  ## [1.32.0] - 2026-05-30
8
40
 
9
41
  ### Phase 32 — Skill Auto-Trigger Discipline + Defensive Guardrails
package/NOTICE CHANGED
@@ -249,14 +249,52 @@ Three ported artifacts:
249
249
  The mechanism is the contribution being attributed; the discipline content is
250
250
  original to get-design-done.
251
251
 
252
+ ──────────────────────────────────────────────────────────────────────────────
253
+ Phase 33 — Skill Behavior Tests (Pressure-Scenario Harness) (v1.33.0, 2026-05-30)
254
+ ──────────────────────────────────────────────────────────────────────────────
255
+
256
+ The skill-behavior pressure-scenario harness shipped in v1.33.0 ports the
257
+ TDD-for-skills METHODOLOGY and the pressure-scenario PATTERN (not the content)
258
+ from:
259
+
260
+ obra/superpowers/skills/writing-skills (https://github.com/obra/superpowers)
261
+ License: MIT
262
+
263
+ writing-skills codifies the TDD-for-skills cycle (RED: an agent fails the task
264
+ without the skill → GREEN: the skill counters those specific rationalizations →
265
+ REFACTOR: close newly-discovered loopholes) and the pattern of testing a skill
266
+ UNDER PRESSURE (time / sunk-cost / authority / exhaustion / scope-minimization)
267
+ rather than only statically. We re-derive the methodology + pattern in GDD's own
268
+ runtime and skill set:
269
+
270
+ scripts/lib/skill-behavior/runner.cjs
271
+ └─ The manifest-driven pressure-scenario runner (injectable agent-invoker
272
+ seam, N-attempts + majority rule, RED→GREEN structured result) adapts
273
+ writing-skills' TDD-for-skills test loop. GDD content: the injectable
274
+ invoker seam (no SDK dependency — D-03), the scenario-manifest schema,
275
+ and the stub-LLM CI path.
276
+
277
+ test/suite/skill-behavior/scenarios/*.json
278
+ └─ The pressure-scenario manifest pattern (a scenario applies a named
279
+ pressure to a skill and scores compliance vs violation against a rubric)
280
+ adapts writing-skills' pressure-test pattern. The specific scenarios,
281
+ pressures, rubrics, and the 8 covered skills are GDD-specific.
282
+
283
+ reference/schemas/pressure-scenario.schema.json
284
+ └─ The scenario-manifest contract formalizing the pattern. GDD original.
285
+
286
+ The methodology + pattern are the contribution being attributed; the scenarios,
287
+ rubrics, runner implementation, and skills are original to get-design-done.
288
+
252
289
  ────────────────────────────────────────────────────────────────────────
253
290
 
254
291
  Note on the broader codebase: get-design-done as a whole is licensed under
255
292
  the MIT License (see LICENSE). The Apache 2.0 attribution above applies
256
293
  specifically to the cc-multi-cli-derived files listed under the Phase 27
257
- block. The MIT attributions under Phase 28.5, Phase 28.7, and Phase 32 cover
258
- content/mechanism adapted from mattpocock/skills (MIT), gsd-build/get-shit-done
259
- (MIT), and obra/superpowers (MIT) respectively — the MIT-to-MIT re-licensing is
260
- straightforward and the attributions above provide the required source
261
- citation. The MIT and Apache 2.0 licenses are compatible — see
294
+ block. The MIT attributions under Phase 28.5, Phase 28.7, Phase 32, and
295
+ Phase 33 cover content/mechanism/methodology adapted from mattpocock/skills
296
+ (MIT), gsd-build/get-shit-done (MIT), obra/superpowers (MIT), and
297
+ obra/superpowers/skills/writing-skills (MIT) respectively — the MIT-to-MIT
298
+ re-licensing is straightforward and the attributions above provide the
299
+ required source citation. The MIT and Apache 2.0 licenses are compatible — see
262
300
  https://www.apache.org/legal/resolved.html#category-a.
package/README.md CHANGED
@@ -288,6 +288,19 @@ GDD ships 70+ skills, but a description-match skill router consults them opportu
288
288
 
289
289
  See [`skills/using-gdd/SKILL.md`](skills/using-gdd/SKILL.md) and the `NOTICE` attribution for details.
290
290
 
291
+ ### Skill behavior tests (v1.33.0+)
292
+
293
+ Static validators check a skill's shape; **behavior tests** check that it holds under pressure. v1.33.0 adds a manifest-driven pressure-scenario harness (porting the TDD-for-skills methodology + pressure-scenario pattern from [`obra/superpowers/skills/writing-skills`](https://github.com/obra/superpowers), MIT): a runner drives a scenario (time / sunk-cost / authority / exhaustion / scope-minimization) through an injectable agent-invoker and scores the response against a compliance/violation rubric with N-attempts + majority rule. Ships 8 scenarios (7 stage skills + `using-gdd`) with synthetic RED baselines.
294
+
295
+ Behavior tests are **opt-in** and key-gated — the default `npm test` stub suite covers the harness structurally and stays CI-green (LLM non-determinism keeps live runs out of CI). To run the live pass:
296
+
297
+ ```bash
298
+ # Skips + exits 0 when ANTHROPIC_API_KEY is unset.
299
+ ANTHROPIC_API_KEY=sk-... GDD_BEHAVIOR_INVOKER=./path/to/invoker.cjs npm run test:behavior
300
+ ```
301
+
302
+ See [`docs/research/description-format-ab.md`](docs/research/description-format-ab.md) for the description-format A/B methodology and [`CONTRIBUTING.md`](CONTRIBUTING.md) ("How to add a pressure scenario").
303
+
291
304
 
292
305
  ## How It Works
293
306
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hegemonart/get-design-done",
3
- "version": "1.32.0",
3
+ "version": "1.33.0",
4
4
  "description": "A design-quality pipeline for AI coding agents: brief, plan, implement, and verify UI work against your design system.",
5
5
  "author": "Hegemon",
6
6
  "homepage": "https://github.com/hegemonart/get-design-done",
@@ -24,7 +24,6 @@
24
24
  "recipes/",
25
25
  "docs/i18n/",
26
26
  "scripts/lib/",
27
- "scripts/mcp-servers/",
28
27
  "scripts/cli/",
29
28
  "scripts/install.cjs",
30
29
  "SKILL.md",
@@ -51,6 +50,7 @@
51
50
  "prepack": "npm run build:sdk",
52
51
  "postpack": "node scripts/build-sdk-bins.cjs --clean",
53
52
  "test": "node --test --experimental-strip-types \"test/suite/**/*.test.cjs\" \"test/suite/**/*.test.ts\"",
53
+ "test:behavior": "node scripts/run-behavior-tests.cjs",
54
54
  "typecheck": "tsc --noEmit",
55
55
  "codegen:schemas": "node --experimental-strip-types scripts/codegen-schema-types.ts",
56
56
  "lint:md": "npx --yes markdownlint-cli2 \"**/*.md\" \"#node_modules\" \"#.planning\" \"#.claude\" \"#test/fixtures/baselines\"",
@@ -0,0 +1,69 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://get-design-done.example/schemas/pressure-scenario.schema.json",
4
+ "title": "Pressure Scenario Manifest",
5
+ "description": "Contract for a Phase-33 skill-behavior pressure-scenario manifest. The runner (scripts/lib/skill-behavior/runner.cjs) loads manifests conforming to this schema, spawns a subagent against `setup_prompt` under the named `pressures`, and validates the response against the `expected_compliance` / `expected_violations` regex sources (compiled with new RegExp(source)). The 5-value `pressures` enum and the required-field set come verbatim from ROADMAP Phase-33 SC#2.",
6
+ "type": "object",
7
+ "additionalProperties": false,
8
+ "required": [
9
+ "name",
10
+ "target_skill",
11
+ "pressures",
12
+ "setup_prompt",
13
+ "expected_compliance",
14
+ "expected_violations"
15
+ ],
16
+ "properties": {
17
+ "name": {
18
+ "type": "string",
19
+ "minLength": 1,
20
+ "description": "Unique scenario identifier, e.g. \"brief-time-pressure\"."
21
+ },
22
+ "target_skill": {
23
+ "type": "string",
24
+ "minLength": 1,
25
+ "description": "The skill under test, e.g. \"brief\", \"explore\", \"plan\", \"using-gdd\"."
26
+ },
27
+ "pressures": {
28
+ "type": "array",
29
+ "minItems": 1,
30
+ "description": "One or more pressure vectors applied in the setup_prompt.",
31
+ "items": {
32
+ "enum": ["time", "sunk-cost", "authority", "exhaustion", "scope-minimization"]
33
+ }
34
+ },
35
+ "setup_prompt": {
36
+ "type": "string",
37
+ "minLength": 1,
38
+ "description": "The prompt handed to the subagent — embeds the pressure(s) and asks it to act."
39
+ },
40
+ "expected_compliance": {
41
+ "type": "array",
42
+ "minItems": 1,
43
+ "description": "Regex SOURCE strings the response MUST match to count as compliant (the runner compiles each with new RegExp(source)).",
44
+ "items": { "type": "string", "minLength": 1 }
45
+ },
46
+ "expected_violations": {
47
+ "type": "array",
48
+ "description": "Regex SOURCE strings that, if matched, count as a violation (the runner compiles each with new RegExp(source)). May be empty.",
49
+ "items": { "type": "string", "minLength": 1 }
50
+ },
51
+ "description": {
52
+ "type": "string",
53
+ "description": "Optional free-text scenario note (33-03 baselines reference it)."
54
+ },
55
+ "variant": {
56
+ "type": "string",
57
+ "description": "Optional A/B variant label, e.g. \"trigger-only\" | \"what-clause\" (33-04 description-format A/B)."
58
+ },
59
+ "variants": {
60
+ "type": "array",
61
+ "description": "Optional array of A/B variant descriptors for a single-manifest A/B pair (33-04). Each item is an object, e.g. { label, description }.",
62
+ "items": { "type": "object" }
63
+ },
64
+ "body_probe": {
65
+ "type": "string",
66
+ "description": "Optional body-only probe prompt the A/B scenario asks (33-04 description-format A/B)."
67
+ }
68
+ }
69
+ }
@@ -0,0 +1,187 @@
1
+ /**
2
+ * runner.cjs — manifest-driven pressure-scenario runner (Plan 33-01).
3
+ *
4
+ * The ROOT engine of Phase 33: every later plan (33-03 scenarios, 33-04 A/B,
5
+ * 33-05 telemetry) builds on this. It loads a parsed pressure-scenario
6
+ * manifest, invokes an agent via an INJECTABLE `invokeAgent(prompt, opts) ->
7
+ * { text }` seam, runs N attempts (default 3), scores each response against
8
+ * the manifest's expected_compliance[] (must-match regexes) and
9
+ * expected_violations[] (failure regexes), applies a STRICT 2/3 majority
10
+ * rule, and emits a structured result.
11
+ *
12
+ * D-03 — invoker-agnostic, NO direct Anthropic SDK dependency:
13
+ * This file depends on node:fs + node:path ONLY. It NEVER requires the
14
+ * Anthropic SDK package. The default invoker is the deterministic stub at
15
+ * ./stub-invoker.cjs so CI/tests run with no API key and no network. A
16
+ * maintainer later wires a real invoker (peer-CLI ACP spawn or a thin keyed
17
+ * SDK adapter) by passing opts.invokeAgent. (The guard test asserts the
18
+ * exact package name never appears in this source.)
19
+ *
20
+ * Purity / injectability:
21
+ * invokeAgent, the clock (now), and fs are all injectable via opts so every
22
+ * test drives the stub with a fixed clock.
23
+ *
24
+ * Result (EXACT shape):
25
+ * {
26
+ * scenario: string, // = manifest.name
27
+ * attempts: Array<{ // one entry per attempt (length === attempts)
28
+ * text: string,
29
+ * pass: boolean, // ALL compliance matched AND zero violations
30
+ * compliance_hits: number, // # expected_compliance regexes matching this text
31
+ * violation_hits: number, // # expected_violations regexes matching this text
32
+ * }>,
33
+ * pass: boolean, // MAJORITY: (#passing attempts) * 2 > attempts.length
34
+ * compliance_hits: number, // aggregate sum across attempts
35
+ * violation_hits: number, // aggregate sum across attempts
36
+ * }
37
+ *
38
+ * Pattern reference (NOT a dependency): scripts/lib/event-chain.cjs shows the
39
+ * house CommonJS idiom (defensive fs, pure functions). Style mirrored, not imported.
40
+ */
41
+
42
+ 'use strict';
43
+
44
+ const nodeFs = require('node:fs');
45
+ const path = require('node:path');
46
+
47
+ const DEFAULT_ATTEMPTS = 3;
48
+
49
/**
 * Load a pressure-scenario manifest.
 *
 * An already-parsed manifest (any non-null object) is returned unchanged. A
 * string is treated as a path to a JSON file: a relative path is resolved
 * against process.cwd(), the file is read as UTF-8 through the injectable fs,
 * and its contents are parsed as JSON. Keeping fs injectable lets callers load
 * real manifest files while tests pass inline objects or a fake fs.
 *
 * @param {object | string} input parsed manifest OR a path to a JSON manifest
 * @param {{ fs?: typeof import('node:fs') }} [deps] optional fs override (defaults to node:fs)
 * @returns {object} the parsed manifest
 * @throws {TypeError} when input is neither an object nor a string, or when the
 *   file parses to something other than an object (e.g. `null`, a number, a string)
 * @throws {SyntaxError} when the file is not valid JSON; the message names the
 *   file and the underlying parse error is attached as `cause`
 */
function loadManifest(input, deps) {
  if (input && typeof input === 'object') {
    return input;
  }
  if (typeof input !== 'string') {
    throw new TypeError('loadManifest: input must be a parsed manifest object or a path string');
  }

  const fs = (deps && deps.fs) || nodeFs;
  const abs = path.isAbsolute(input) ? input : path.resolve(process.cwd(), input);
  const raw = fs.readFileSync(abs, 'utf8');

  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    // Re-throw as the same error type, but say WHICH manifest was malformed.
    throw new SyntaxError(`loadManifest: invalid JSON in ${abs}: ${err.message}`, { cause: err });
  }
  // A file holding a bare scalar or null is valid JSON but not a manifest.
  if (!parsed || typeof parsed !== 'object') {
    throw new TypeError(`loadManifest: ${abs} must contain a JSON object`);
  }
  return parsed;
}
71
+
72
/**
 * Turn a list of regex SOURCE strings into RegExp instances.
 *
 * Manifests keep their patterns as plain strings so they remain valid JSON;
 * compilation happens here. Each entry is coerced with String() and compiled
 * without flags. Anything that is not an array yields an empty list.
 *
 * @param {unknown} sources
 * @returns {RegExp[]}
 */
function compilePatterns(sources) {
  const toRegExp = (source) => new RegExp(String(source));
  return Array.isArray(sources) ? sources.map(toRegExp) : [];
}
84
+
85
/**
 * Extract the `.text` of an invoker response as a string.
 *
 * A missing response, or a `.text` that is not a string, collapses to '' so
 * that downstream scoring always receives a string.
 *
 * @param {unknown} response
 * @returns {string}
 */
function textOf(response) {
  const candidate = response ? response.text : undefined;
  return typeof candidate === 'string' ? candidate : '';
}
96
+
97
/**
 * Score a single response text against pre-compiled compliance/violation
 * regexes.
 *
 * An attempt passes iff EVERY compliance regex matches and NO violation regex
 * matches. With an empty compliance list the first condition is vacuously
 * true, so the attempt passes whenever no violation matches.
 *
 * @param {string} text response text to score
 * @param {RegExp[]} complianceRes regexes the text must match
 * @param {RegExp[]} violationRes regexes the text must not match
 * @returns {{ text: string, pass: boolean, compliance_hits: number, violation_hits: number }}
 */
function scoreAttempt(text, complianceRes, violationRes) {
  // RegExp#test is stateful for global/sticky patterns: it resumes from
  // lastIndex. Rewind those first so a regex reused across attempts (or passed
  // in pre-compiled with /g or /y) scores the same text the same way each time.
  const matches = (re) => {
    if (re.global || re.sticky) {
      re.lastIndex = 0;
    }
    return re.test(text);
  };

  const compliance_hits = complianceRes.filter(matches).length;
  const violation_hits = violationRes.filter(matches).length;
  const pass = compliance_hits === complianceRes.length && violation_hits === 0;
  return { text, pass, compliance_hits, violation_hits };
}
113
+
114
/**
 * Run a pressure scenario: invoke the agent seam N times, score each response,
 * and apply a strict majority rule.
 *
 * The invoker is called SYNCHRONOUSLY as `invokeAgent(prompt, { scenario, attempt })`.
 * A return value without a string `.text` (including a Promise, which is not
 * awaited here) is scored as empty text. An invoker that THROWS never crashes
 * the run: that attempt is recorded with empty text and is always a failure,
 * even if the manifest's compliance patterns happen to match the empty string.
 *
 * @param {object} manifest parsed pressure-scenario manifest
 *   { name, target_skill, pressures[], setup_prompt, expected_compliance[], expected_violations[] }
 * @param {{
 *   invokeAgent?: (prompt: string, opts: object) => { text: string },
 *   attempts?: number,
 *   now?: () => number,
 *   fs?: typeof import('node:fs'),
 * }} [opts] `attempts` must be a positive integer, otherwise DEFAULT_ATTEMPTS
 *   is used; `invokeAgent` defaults to the deterministic stub invoker
 * @returns {{
 *   scenario: string,
 *   attempts: Array<{ text: string, pass: boolean, compliance_hits: number, violation_hits: number }>,
 *   pass: boolean,
 *   compliance_hits: number,
 *   violation_hits: number,
 * }} `pass` is true iff strictly more than half of the attempts passed; the
 *   hit counts are summed across attempts
 */
function runScenario(manifest, opts) {
  const o = opts || {};
  // D-03: default to the deterministic stub invoker — never the real SDK.
  const invokeAgent = o.invokeAgent || require('./stub-invoker.cjs').invokeAgent;
  const attempts =
    Number.isInteger(o.attempts) && o.attempts > 0 ? o.attempts : DEFAULT_ATTEMPTS;
  // Injectable clock (reserved for future telemetry timestamps; called so the
  // seam is exercised and a fixed now() is honored).
  const now = typeof o.now === 'function' ? o.now : Date.now;

  const complianceRes = compilePatterns(manifest && manifest.expected_compliance);
  const violationRes = compilePatterns(manifest && manifest.expected_violations);
  const scenario = manifest && manifest.name;
  const prompt = (manifest && manifest.setup_prompt) || '';

  const attemptResults = [];
  for (let i = 0; i < attempts; i++) {
    now(); // exercise the injectable clock (deterministic under a fixed now)
    let text = '';
    let invokerThrew = false;
    try {
      // Pass the scenario key through so the stub (or a real invoker) can key on it.
      text = textOf(invokeAgent(prompt, { scenario, attempt: i }));
    } catch (_err) {
      // A thrown invoker must NOT crash the run — it is recorded as a failed
      // empty attempt below.
      invokerThrew = true;
    }
    const scored = scoreAttempt(text, complianceRes, violationRes);
    // Scoring '' can still "pass" when compliance patterns match the empty
    // string (or there are none); a crashed invoker must never count as a pass.
    attemptResults.push(invokerThrew ? { ...scored, pass: false } : scored);
  }

  const passed = attemptResults.filter((a) => a.pass).length;
  // STRICT majority: 2/3 and 3/3 pass; 0/3 and 1/3 fail.
  const pass = passed * 2 > attemptResults.length;

  const compliance_hits = attemptResults.reduce((sum, a) => sum + a.compliance_hits, 0);
  const violation_hits = attemptResults.reduce((sum, a) => sum + a.violation_hits, 0);

  return {
    scenario,
    attempts: attemptResults,
    pass,
    compliance_hits,
    violation_hits,
  };
}
179
+
180
+ module.exports = {
181
+ runScenario,
182
+ loadManifest,
183
+ // Exposed for unit-level reuse / later plans; not part of the core contract.
184
+ scoreAttempt,
185
+ compilePatterns,
186
+ DEFAULT_ATTEMPTS,
187
+ };
@@ -0,0 +1,95 @@
1
+ /**
2
+ * stub-invoker.cjs — deterministic, scenario-keyed agent invoker (Plan 33-01).
3
+ *
4
+ * The DEFAULT invokeAgent seam for `runner.cjs` (D-03): the runner is
5
+ * invoker-agnostic and exposes an injectable `invokeAgent(prompt, opts) ->
6
+ * { text }` seam. A maintainer later wires a REAL invoker (a peer-CLI ACP
7
+ * spawn of a local `claude`/`codex`, or a thin keyed SDK adapter); this stub
8
+ * is what every Phase-33 CI/structural test drives so runs are reproducible
9
+ * with NO API key and NO network.
10
+ *
11
+ * Determinism contract:
12
+ * * NO randomness, NO network, NO @anthropic-ai/sdk.
13
+ * * A canned response is resolved by a KEY derived from
14
+ * opts.scenario || opts.stubKey, falling back to scanning `prompt` for a
15
+ * registered key marker.
16
+ * * An UNKNOWN key returns a neutral { text: '' } so the runner never throws.
17
+ *
18
+ * Tests MAY instead pass their own inline invokeAgent to runScenario — both
19
+ * paths are valid (D-03). This module is the no-arg default.
20
+ */
21
+
22
+ 'use strict';
23
+
24
// Canned-response registry (key -> response text). It starts with a single
// illustrative entry; further entries are added through register().
const TABLE = new Map();

// Demo seed: a neutral, compliance-shaped sample so the default stub returns
// non-empty text for one known key. Real scenarios register their own text.
TABLE.set(
  'runner-demo',
  'A <HARD-GATE> blocks me — I must write the brief before any other stage.',
);
34
+
35
/**
 * Add a canned response for a scenario key, replacing any existing entry.
 *
 * A non-string `text` is stored as a string: null/undefined become '', any
 * other value is converted with String().
 *
 * @param {string} key scenario name / stub key (must be a non-empty string)
 * @param {string} text canned response text the stub returns for that key
 * @returns {void}
 * @throws {TypeError} when key is not a non-empty string
 */
function register(key, text) {
  const keyIsValid = typeof key === 'string' && key.length > 0;
  if (!keyIsValid) {
    throw new TypeError('register: key must be a non-empty string');
  }

  let canned;
  if (typeof text === 'string') {
    canned = text;
  } else if (text == null) {
    canned = '';
  } else {
    canned = String(text);
  }
  TABLE.set(key, canned);
}
48
+
49
/**
 * Work out which response key a call refers to.
 *
 * Precedence: a non-empty `opts.scenario`, then a non-empty `opts.stubKey`,
 * then the first registered key (in table insertion order) that occurs as a
 * substring of a non-empty string `prompt`. Yields undefined when none apply.
 *
 * @param {string} prompt
 * @param {{scenario?: string, stubKey?: string} | undefined} opts
 * @returns {string | undefined}
 */
function resolveKey(prompt, opts) {
  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;

  if (opts) {
    if (isNonEmptyString(opts.scenario)) {
      return opts.scenario;
    }
    if (isNonEmptyString(opts.stubKey)) {
      return opts.stubKey;
    }
  }
  if (!isNonEmptyString(prompt)) {
    return undefined;
  }
  return [...TABLE.keys()].find((key) => prompt.includes(key));
}
71
+
72
/**
 * Deterministic stand-in for an agent invoker.
 *
 * Resolves a key via resolveKey(prompt, opts) and returns the canned text
 * registered under it. When no key resolves, or the key is not in the table,
 * it returns { text: '' } rather than throwing.
 *
 * @param {string} prompt
 * @param {{scenario?: string, stubKey?: string}} [opts]
 * @returns {{ text: string }}
 */
function invokeAgent(prompt, opts) {
  const key = resolveKey(prompt, opts);
  const isKnown = key !== undefined && TABLE.has(key);
  // Unknown key -> neutral empty text; never throw.
  return { text: isKnown ? TABLE.get(key) : '' };
}
89
+
90
+ module.exports = {
91
+ invokeAgent,
92
+ register,
93
+ // Exposed for advanced callers/tests that want to inspect or reset seeds.
94
+ _table: TABLE,
95
+ };
@@ -0,0 +1,379 @@
1
+ /**
2
+ * telemetry.cjs — reflector-telemetry layer for the pressure-scenario harness
3
+ * (Plan 33-05). The third leg of Phase 33: it CONSUMES the 33-01 runner result
4
+ * ({ scenario, target_skill, pass, compliance_hits, violation_hits }), records a
5
+ * scenario-failure event to a JSONL artifact, detects SUSTAINED failure, and on
6
+ * sustained failure produces a PROPOSE-ONLY reflector content-edit draft via the
7
+ * same incubator/apply-reflections surface the shipped reflector-kfm-proposer
8
+ * uses.
9
+ *
10
+ * Why this module exists: behavior tests only matter if a sustained failure
11
+ * prompts a content fix. This closes that loop — a failing run is recorded; when
12
+ * a scenario fails ≥3 of its last 10 runs (D-07 threshold), the reflector
13
+ * proposes a skill-content edit for human review via /gdd:apply-reflections. The
14
+ * proposal NEVER auto-edits a skill (Phase 11/29 propose-only SC; Phase 33
15
+ * out-of-scope: "Auto-applying reflector-proposed skill edits — propose-only").
16
+ *
17
+ * Decisions honored:
18
+ * * D-07 — telemetry → .design/telemetry/skill-behavior.jsonl (runtime
19
+ * artifact, gitignored, local); sustained-failure signal = ≥3 of the last 10
20
+ * runs failing for a scenario; reflector consumption is STUB-tested (no live
21
+ * runs — all paths + the clock are injectable so tests use a tmp dir).
22
+ * * D-06 — this module is exercised by the DEFAULT suite (no API key / no LLM).
23
+ *
24
+ * Injectability / purity:
25
+ * The JSONL path, the incubator root, `fs`, and the clock (`now`) are ALL
26
+ * injectable via opts so every test writes to an os.tmpdir() dir and NOTHING
27
+ * touches the real .design/ tree. The runner (33-01) does NOT stamp a `ts`;
28
+ * the timestamp is stamped HERE via the injected `now`.
29
+ *
30
+ * Pattern references (style mirrored, NOT imported):
31
+ * * scripts/lib/event-chain.cjs — house JSONL append (defensive mkdir -p +
32
+ * append, never-throw) + findRepoRoot + line-by-line read idiom.
33
+ * * scripts/lib/reflector-kfm-proposer.cjs — shouldPropose-style stability gate
34
+ * + proposeKfmDraft writing a proposal-only draft under
35
+ * .design/reflections/incubator/<slug>/CATALOGUE-ENTRY.md.
36
+ *
37
+ * Public API:
38
+ * recordRun(result, opts) → event | null (append on pass:false)
39
+ * readRuns(scenario, opts) → Array<event> (tail JSONL, filter)
40
+ * isSustainedFailure(scenario, opts) → boolean (≥3 of last 10 failed)
41
+ * maybeProposeReflection(scenario, opts) → { action:'drafted', path, slug }
42
+ * | { action:'skipped', reason }
43
+ *
44
+ * Pure CommonJS, deps = node:fs + node:path ONLY. No npm dependencies.
45
+ */
46
+
47
+ 'use strict';
48
+
49
+ const nodeFs = require('node:fs');
50
+ const path = require('node:path');
51
+
52
+ // -------------------------------------------------------------------
53
+ // Constants
54
+ // -------------------------------------------------------------------
55
+
56
+ const EVENT_TYPE = 'skill_behavior_failure'; // event_type value stamped on every row recordRun appends
57
+ const DEFAULT_JSONL_REL = '.design/telemetry/skill-behavior.jsonl'; // default JSONL location, joined onto the repo root by resolveJsonlPath
58
+ const DEFAULT_INCUBATOR_REL = '.design/reflections/incubator'; // default draft root, joined onto the repo root by resolveIncubatorRoot
59
+ const SUSTAINED_WINDOW = 10; // D-07: look at the last N runs (default; isSustainedFailure also accepts opts.window_size)
60
+ const SUSTAINED_THRESHOLD = 3; // D-07: ≥3 failures of the last 10 == sustained (default; isSustainedFailure also accepts opts.threshold)
61
+ const INCUBATOR_PREFIX = 'skill-edit-'; // prepended to the derived scenario slug to name the draft directory
62
+
63
+ // -------------------------------------------------------------------
64
+ // Helpers
65
+ // -------------------------------------------------------------------
66
+
67
/**
 * Locate the repository root by climbing from `startDir` (default: this
 * module's directory) until a directory containing package.json is found.
 * The climb inspects at most 12 directories and stops early at the filesystem
 * root; if nothing is found, the directory three levels above this module is
 * returned as a fallback.
 *
 * @param {string} [startDir] directory to start climbing from
 * @returns {string} the directory holding package.json, or the fallback path
 */
function findRepoRoot(startDir) {
  const MAX_LEVELS = 12;
  let current = startDir || __dirname;
  let level = 0;

  while (level < MAX_LEVELS) {
    const manifest = path.join(current, 'package.json');
    if (nodeFs.existsSync(manifest)) {
      return current;
    }
    const up = path.dirname(current);
    if (up === current) {
      // dirname() is a fixed point at the filesystem root — nothing left to climb.
      break;
    }
    current = up;
    level += 1;
  }

  return path.resolve(__dirname, '..', '..', '..');
}
84
+
85
/**
 * Work out where the telemetry JSONL lives.
 *
 * An explicit `opts.jsonlPath` wins: an absolute path is used verbatim, a
 * relative one is resolved against `opts.repoRoot` when given, otherwise
 * against the current working directory. Without `opts.jsonlPath` the result
 * is DEFAULT_JSONL_REL joined onto `opts.repoRoot` (or the discovered repo
 * root).
 *
 * @param {{ jsonlPath?: string, repoRoot?: string }} [opts]
 * @returns {string}
 */
function resolveJsonlPath(opts) {
  const settings = opts || {};
  const explicit = settings.jsonlPath;

  if (!explicit) {
    const root = settings.repoRoot || findRepoRoot();
    return path.join(root, DEFAULT_JSONL_REL);
  }
  if (path.isAbsolute(explicit)) {
    return explicit;
  }
  const base = settings.repoRoot || process.cwd();
  return path.resolve(base, explicit);
}
98
+
99
/**
 * Work out the directory under which proposal drafts are written.
 *
 * An explicit `opts.incubatorRoot` wins: an absolute path is used verbatim, a
 * relative one is resolved against `opts.repoRoot` when given, otherwise
 * against the current working directory. Without `opts.incubatorRoot` the
 * result is DEFAULT_INCUBATOR_REL joined onto `opts.repoRoot` (or the
 * discovered repo root).
 *
 * @param {{ incubatorRoot?: string, repoRoot?: string }} [opts]
 * @returns {string}
 */
function resolveIncubatorRoot(opts) {
  const settings = opts || {};
  const explicit = settings.incubatorRoot;

  if (!explicit) {
    const root = settings.repoRoot || findRepoRoot();
    return path.join(root, DEFAULT_INCUBATOR_REL);
  }
  if (path.isAbsolute(explicit)) {
    return explicit;
  }
  const base = settings.repoRoot || process.cwd();
  return path.resolve(base, explicit);
}
112
+
113
/**
 * Turn a free-text scenario name into a kebab-case slug: lower-cased,
 * characters outside printable ASCII dropped, every run of non-alphanumerics
 * replaced by a single dash, leading/trailing dashes trimmed, and capped at 40
 * characters (with any dash left dangling by the cut removed). Non-string or
 * empty-after-cleaning input yields 'unnamed'.
 *
 * @param {string} text
 * @returns {string}
 */
function deriveSlug(text) {
  const MAX_LENGTH = 40;
  const source = typeof text === 'string' ? text : '';

  const dashed = source
    .toLowerCase()
    .replace(/[^\x20-\x7e]+/g, '')
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/-+/g, '-')
    .replace(/^-+|-+$/g, '');

  const clipped = dashed.length > MAX_LENGTH ? dashed.slice(0, MAX_LENGTH) : dashed;
  // The cut may land right after a dash; strip it so the slug never ends in '-'.
  const cleaned = clipped.replace(/-+$/g, '');
  return cleaned || 'unnamed';
}
128
+
129
+ // -------------------------------------------------------------------
130
+ // recordRun — emit a scenario-failure event to the JSONL artifact
131
+ // -------------------------------------------------------------------
132
+
133
/**
 * Persist one scenario-failure event for a runner result whose `pass` is
 * exactly `false`; any other result (non-object, passing, or missing `pass`)
 * is ignored and yields null. The `ts` field is stamped here from the injected
 * clock (default: current ISO-8601 time).
 *
 * The parent directory of the JSONL file is created on demand. A failed
 * mkdir/append is reported on stderr and otherwise swallowed, so the event
 * object is still returned even when it could not be written.
 *
 * EVENT SHAPE:
 *   { event_type:'skill_behavior_failure', scenario, pass:false,
 *     compliance_hits, violation_hits, ts, target_skill? }
 *
 * @param {{ scenario:string, target_skill?:string, pass:boolean,
 *           compliance_hits?:number, violation_hits?:number }} result
 * @param {{ jsonlPath?:string, fs?:typeof import('node:fs'),
 *           now?:() => number|string, repoRoot?:string }} [opts]
 * @returns {object | null} the event built for a failing result, else null
 */
function recordRun(result, opts) {
  const settings = opts || {};
  const io = settings.fs || nodeFs;
  const clock = typeof settings.now === 'function' ? settings.now : () => new Date().toISOString();

  // Only genuine failure results are recorded; everything else emits nothing.
  const isResultObject = Boolean(result) && typeof result === 'object';
  if (!isResultObject || result.pass !== false) {
    return null;
  }

  // Hit counters fall back to 0 unless they are finite numbers.
  const finiteOrZero = (value) => (Number.isFinite(value) ? value : 0);

  const event = {
    event_type: EVENT_TYPE,
    scenario: result.scenario,
    pass: false,
    compliance_hits: finiteOrZero(result.compliance_hits),
    violation_hits: finiteOrZero(result.violation_hits),
    ts: clock(),
  };
  // Carry target_skill through only when the runner supplied one.
  if (result.target_skill !== undefined) {
    event.target_skill = result.target_skill;
  }

  const destination = resolveJsonlPath(settings);
  try {
    io.mkdirSync(path.dirname(destination), { recursive: true });
    io.appendFileSync(destination, `${JSON.stringify(event)}\n`, { flag: 'a' });
  } catch (err) {
    // Telemetry must never crash a run: report on stderr, then carry on.
    try {
      const detail = err && err.message ? err.message : String(err);
      process.stderr.write(`[skill-behavior-telemetry] write failed: ${detail}\n`);
    } catch (_ignored) {
      /* swallow */
    }
  }
  return event;
}
188
+
189
+ // -------------------------------------------------------------------
190
+ // readRuns — tail the JSONL, filter by scenario
191
+ // -------------------------------------------------------------------
192
+
193
/**
 * Load the JSONL artifact and return the records whose `scenario` equals the
 * given one, preserving file order (oldest → newest). A missing or unreadable
 * file yields []. Blank lines and lines that are not valid JSON are skipped.
 *
 * @param {string} scenario
 * @param {{ jsonlPath?:string, fs?:typeof import('node:fs'), repoRoot?:string }} [opts]
 * @returns {Array<object>}
 */
function readRuns(scenario, opts) {
  const settings = opts || {};
  const io = settings.fs || nodeFs;
  const source = resolveJsonlPath(settings);

  if (!io.existsSync(source)) {
    return [];
  }

  let contents;
  try {
    contents = io.readFileSync(source, 'utf8');
  } catch (_e) {
    return [];
  }

  // JSON.parse never yields undefined, so undefined safely marks a malformed line.
  const parseLine = (line) => {
    try {
      return JSON.parse(line);
    } catch (_e) {
      return undefined;
    }
  };

  return contents
    .split('\n')
    .filter((line) => line.trim() !== '')
    .map(parseLine)
    .filter((record) => record && record.scenario === scenario);
}
228
+
229
+ // -------------------------------------------------------------------
230
+ // isSustainedFailure — ≥3 of the last 10 runs failed for a scenario (D-07)
231
+ // -------------------------------------------------------------------
232
+
233
/**
 * Sustained-failure detector (D-07). Takes the most recent `window_size` runs
 * (default SUSTAINED_WINDOW) for `scenario` and reports whether at least
 * `threshold` of them (default SUSTAINED_THRESHOLD) count as failures.
 *
 * The runs come from `opts.window` when it is an array (in-memory path, used
 * by unit tests), otherwise from the on-disk JSONL via readRuns().
 *
 * A run counts as a failure when it is truthy and its `pass` is anything other
 * than `true` — so a row without a `pass` field is treated as failed. Only the
 * last `window_size` entries are considered; older ones are ignored. With the
 * defaults: 2/10 → false, 3/10 → true.
 *
 * `window_size` / `threshold` overrides are honoured only when they are
 * positive integers; anything else falls back to the defaults.
 *
 * @param {string} scenario
 * @param {{ window?:Array<{pass:boolean}>, jsonlPath?:string,
 *           fs?:typeof import('node:fs'), window_size?:number,
 *           threshold?:number, repoRoot?:string }} [opts]
 * @returns {boolean}
 */
function isSustainedFailure(scenario, opts) {
  const settings = opts || {};
  const isPositiveInt = (value) => Number.isInteger(value) && value > 0;

  const span = isPositiveInt(settings.window_size) ? settings.window_size : SUSTAINED_WINDOW;
  const minFailures = isPositiveInt(settings.threshold) ? settings.threshold : SUSTAINED_THRESHOLD;

  const history = Array.isArray(settings.window) ? settings.window : readRuns(scenario, settings);

  // Count failures strictly within the trailing `span` runs.
  let failed = 0;
  for (const run of history.slice(-span)) {
    if (run && run.pass !== true) {
      failed += 1;
    }
  }
  return failed >= minFailures;
}
272
+
273
+ // -------------------------------------------------------------------
274
+ // maybeProposeReflection — propose-only reflector content-edit draft
275
+ // -------------------------------------------------------------------
276
+
277
/**
 * Reflector consumption point: gate on isSustainedFailure(scenario, opts) and,
 * only when the gate fires, write a PROPOSE-ONLY draft to
 * <incubatorRoot>/skill-edit-<slug>/CATALOGUE-ENTRY.md, where <slug> is
 * deriveSlug(scenario). The draft names the failing scenario, the target skill
 * (when known), the sustained-failure signal, and TODOs for the content edit.
 *
 * This function only writes the draft file; it never edits a skill. An existing
 * draft at the same path is overwritten.
 *
 * The "Signal" line reports the threshold/window that the gate actually used:
 * positive-integer `opts.threshold` / `opts.window_size` overrides are passed
 * through to isSustainedFailure, so they are reflected in the draft as well
 * (with no overrides the text shows the D-07 defaults).
 *
 * Target skill resolution: `opts.target_skill` if given; otherwise, when no
 * in-memory `opts.window` was supplied, the `target_skill` of the latest
 * recorded event for this scenario; otherwise a TODO placeholder.
 *
 * @param {string} scenario
 * @param {{ window?:Array<{pass:boolean}>, jsonlPath?:string,
 *           incubatorRoot?:string, fs?:typeof import('node:fs'),
 *           now?:() => number|string, target_skill?:string,
 *           window_size?:number, threshold?:number,
 *           repoRoot?:string }} [opts]
 * @returns {{ action:'drafted', path:string, slug:string }
 *          | { action:'skipped', reason:string }}
 *   'skipped' with reason 'below_sustained_threshold' when the gate does not
 *   fire, or 'draft_write_failed: <message>' when the draft cannot be written.
 */
function maybeProposeReflection(scenario, opts) {
  const o = opts || {};
  const fs = o.fs || nodeFs;
  const now = typeof o.now === 'function' ? o.now : () => new Date().toISOString();

  // Stability gate — the sustained-failure threshold (≥3/10 by default).
  if (!isSustainedFailure(scenario, o)) {
    return { action: 'skipped', reason: 'below_sustained_threshold' };
  }

  // Apply the same override rules as isSustainedFailure so the draft describes
  // the gate that actually fired rather than always quoting the defaults.
  const windowSize = Number.isInteger(o.window_size) && o.window_size > 0 ? o.window_size : SUSTAINED_WINDOW;
  const threshold = Number.isInteger(o.threshold) && o.threshold > 0 ? o.threshold : SUSTAINED_THRESHOLD;

  const slug = `${INCUBATOR_PREFIX}${deriveSlug(scenario)}`;
  const incubatorRoot = resolveIncubatorRoot(o);
  const draftDir = path.join(incubatorRoot, slug);
  const draftPath = path.join(draftDir, 'CATALOGUE-ENTRY.md');

  // Best-effort target_skill: prefer an injected hint, else the latest recorded
  // event for this scenario. The disk lookup is skipped when an in-memory
  // window drove the gate.
  let targetSkill = o.target_skill;
  if (!targetSkill && !Array.isArray(o.window)) {
    const recorded = readRuns(scenario, o);
    const last = recorded.length ? recorded[recorded.length - 1] : null;
    if (last && last.target_skill) targetSkill = last.target_skill;
  }

  const body = [
    `# Skill-edit proposal — ${scenario}`,
    '',
    `**Source:** skill-behavior-telemetry (pressure-scenario harness)`,
    `**Failing scenario:** ${scenario}`,
    `**Target skill:** ${targetSkill || 'TODO: <skill that failed under pressure>'}`,
    `**Signal:** sustained failure — ≥${threshold} of the last ${windowSize} runs failed (D-07).`,
    '',
    `Drafted ${now()}. **PROPOSE-ONLY** — review via \`/gdd:apply-reflections\`.`,
    'This draft NEVER auto-edits a skill (Phase 11/29 propose-only SC; Phase 33 out-of-scope).',
    '',
    '## Rationalization signal',
    '',
    `The "${scenario}" pressure scenario is failing repeatedly: the target skill is`,
    'not holding under pressure (an agent is rationalizing past its HARD-GATE /',
    'rationalization table). A content edit is proposed to close the loophole.',
    '',
    '## Proposed content edit',
    '',
    `- TODO: identify which rationalization the "${scenario}" scenario exploits.`,
    '- TODO: add / strengthen the counter-rationalization row in the target skill',
    " (the '| Thought | Reality |' table) OR tighten its <HARD-GATE> wording.",
    '- TODO: re-run `npm run test:behavior` for this scenario to confirm GREEN.',
    '',
  ].join('\n');

  try {
    fs.mkdirSync(draftDir, { recursive: true });
    fs.writeFileSync(draftPath, body);
  } catch (err) {
    // A draft-write failure must not crash the harness; surface as skipped.
    return { action: 'skipped', reason: `draft_write_failed: ${err && err.message ? err.message : String(err)}` };
  }

  return { action: 'drafted', path: draftPath, slug };
}
361
+
362
+ // -------------------------------------------------------------------
363
+ // Exports
364
+ // -------------------------------------------------------------------
365
+
366
+ module.exports = {
367
+ recordRun,
368
+ readRuns,
369
+ isSustainedFailure,
370
+ maybeProposeReflection,
371
+ // Constants and underscore-prefixed helper aliases below are exposed for tests / higher-level integration (INCUBATOR_PREFIX is not exported).
372
+ EVENT_TYPE,
373
+ DEFAULT_JSONL_REL,
374
+ DEFAULT_INCUBATOR_REL,
375
+ SUSTAINED_WINDOW,
376
+ SUSTAINED_THRESHOLD,
377
+ _deriveSlug: deriveSlug,
378
+ _findRepoRoot: findRepoRoot,
379
+ };
@@ -1,29 +0,0 @@
1
- // scripts/lib/cli/index.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real implementation moved to sdk/cli/index.ts
4
- // in Plan 31-5-04 (SDK consolidation). This file is re-created at the OLD
5
- // path so undocumented EXTERNAL importers (anyone who reached into
6
- // node_modules/@hegemonart/get-design-done/scripts/lib/cli/index.ts directly)
7
- // keep working for one minor grace window.
8
- //
9
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
10
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
11
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only; 31-5-10's
12
- // no-stale-internal-refs guard excludes files carrying the
13
- // GDD-DEPRECATION-SHIM marker above.
14
- //
15
- // Runs under --experimental-strip-types (the runtime `bin/gdd-sdk` and the
16
- // test suite both use it), so `export *` re-export is strip-types-clean.
17
-
18
- import { emitWarning } from 'node:process';
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- emitWarning(
24
- 'scripts/lib/cli/index.ts is deprecated; import sdk/cli instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- export * from '../../../sdk/cli/index.ts'; // forward the relocated module's named exports from the old path
@@ -1,29 +0,0 @@
1
- 'use strict';
2
- // scripts/lib/error-classifier.cjs — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
3
- //
4
- // Thin deprecation shim. The real implementation moved to
5
- // sdk/primitives/error-classifier.cjs in Plan 31-5-04 (SDK consolidation).
6
- // This file is re-created at the OLD path so undocumented EXTERNAL importers
7
- // (anyone who reached into node_modules/@hegemonart/get-design-done/scripts/
8
- // lib/error-classifier.cjs directly) keep working for one minor grace window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
12
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only and 31-5-10's
13
- // no-stale-internal-refs guard excludes files carrying the GDD-DEPRECATION-SHIM
14
- // marker above.
15
- //
16
- // Emits a DeprecationWarning exactly ONCE per process: the module-level
17
- // `warned` flag plus Node's module cache (this file is evaluated once per
18
- // process regardless of how many times it is required).
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- process.emitWarning(
24
- 'scripts/lib/error-classifier.cjs is deprecated; import sdk/primitives/error-classifier instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- module.exports = require('../../sdk/primitives/error-classifier.cjs'); // hand back the relocated module's exports object unchanged
@@ -1,29 +0,0 @@
1
- // scripts/lib/event-stream/index.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real implementation moved to
4
- // sdk/event-stream/index.ts in Plan 31-5-04 (SDK consolidation). This file
5
- // is re-created at the OLD path so undocumented EXTERNAL importers (anyone
6
- // who reached into node_modules/@hegemonart/get-design-done/scripts/lib/
7
- // event-stream/index.ts directly) keep working for one minor grace window.
8
- //
9
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
10
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
11
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only; 31-5-10's
12
- // no-stale-internal-refs guard excludes files carrying the
13
- // GDD-DEPRECATION-SHIM marker above.
14
- //
15
- // Runs under --experimental-strip-types, so `export *` re-export is
16
- // strip-types-clean.
17
-
18
- import { emitWarning } from 'node:process';
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- emitWarning(
24
- 'scripts/lib/event-stream/index.ts is deprecated; import sdk/event-stream instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- export * from '../../../sdk/event-stream/index.ts'; // forward the relocated module's named exports from the old path
@@ -1,29 +0,0 @@
1
- // scripts/lib/gdd-errors/index.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real implementation moved to sdk/errors/index.ts
4
- // in Plan 31-5-04 (SDK consolidation). This file is re-created at the OLD
5
- // path so undocumented EXTERNAL importers (anyone who reached into
6
- // node_modules/@hegemonart/get-design-done/scripts/lib/gdd-errors/index.ts
7
- // directly) keep working for one minor grace window.
8
- //
9
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
10
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
11
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only; 31-5-10's
12
- // no-stale-internal-refs guard excludes files carrying the
13
- // GDD-DEPRECATION-SHIM marker above.
14
- //
15
- // Runs under --experimental-strip-types, so `export *` re-export is
16
- // strip-types-clean.
17
-
18
- import { emitWarning } from 'node:process';
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- emitWarning(
24
- 'scripts/lib/gdd-errors/index.ts is deprecated; import sdk/errors instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- export * from '../../../sdk/errors/index.ts'; // forward the relocated module's named exports from the old path
@@ -1,29 +0,0 @@
1
- // scripts/lib/gdd-state/index.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real implementation moved to sdk/state/index.ts
4
- // in Plan 31-5-04 (SDK consolidation). This file is re-created at the OLD
5
- // path so undocumented EXTERNAL importers (anyone who reached into
6
- // node_modules/@hegemonart/get-design-done/scripts/lib/gdd-state/index.ts
7
- // directly) keep working for one minor grace window.
8
- //
9
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
10
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
11
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only; 31-5-10's
12
- // no-stale-internal-refs guard excludes files carrying the
13
- // GDD-DEPRECATION-SHIM marker above.
14
- //
15
- // Runs under --experimental-strip-types, so `export *` re-export is
16
- // strip-types-clean.
17
-
18
- import { emitWarning } from 'node:process';
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- emitWarning(
24
- 'scripts/lib/gdd-state/index.ts is deprecated; import sdk/state instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- export * from '../../../sdk/state/index.ts'; // forward the relocated module's named exports from the old path
@@ -1,29 +0,0 @@
1
- 'use strict';
2
- // scripts/lib/iteration-budget.cjs — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
3
- //
4
- // Thin deprecation shim. The real implementation moved to
5
- // sdk/primitives/iteration-budget.cjs in Plan 31-5-04 (SDK consolidation).
6
- // This file is re-created at the OLD path so undocumented EXTERNAL importers
7
- // (anyone who reached into node_modules/@hegemonart/get-design-done/scripts/
8
- // lib/iteration-budget.cjs directly) keep working for one minor grace window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
12
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only and 31-5-10's
13
- // no-stale-internal-refs guard excludes files carrying the GDD-DEPRECATION-SHIM
14
- // marker above.
15
- //
16
- // Emits a DeprecationWarning exactly ONCE per process: the module-level
17
- // `warned` flag plus Node's module cache (this file is evaluated once per
18
- // process regardless of how many times it is required).
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- process.emitWarning(
24
- 'scripts/lib/iteration-budget.cjs is deprecated; import sdk/primitives/iteration-budget instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- module.exports = require('../../sdk/primitives/iteration-budget.cjs'); // hand back the relocated module's exports object unchanged
@@ -1,29 +0,0 @@
1
- 'use strict';
2
- // scripts/lib/jittered-backoff.cjs — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
3
- //
4
- // Thin deprecation shim. The real implementation moved to
5
- // sdk/primitives/jittered-backoff.cjs in Plan 31-5-04 (SDK consolidation).
6
- // This file is re-created at the OLD path so undocumented EXTERNAL importers
7
- // (anyone who reached into node_modules/@hegemonart/get-design-done/scripts/
8
- // lib/jittered-backoff.cjs directly) keep working for one minor grace window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
12
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only and 31-5-10's
13
- // no-stale-internal-refs guard excludes files carrying the GDD-DEPRECATION-SHIM
14
- // marker above.
15
- //
16
- // Emits a DeprecationWarning exactly ONCE per process: the module-level
17
- // `warned` flag plus Node's module cache (this file is evaluated once per
18
- // process regardless of how many times it is required).
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- process.emitWarning(
24
- 'scripts/lib/jittered-backoff.cjs is deprecated; import sdk/primitives/jittered-backoff instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- module.exports = require('../../sdk/primitives/jittered-backoff.cjs'); // hand back the relocated module's exports object unchanged
@@ -1,29 +0,0 @@
1
- 'use strict';
2
- // scripts/lib/lockfile.cjs — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
3
- //
4
- // Thin deprecation shim. The real implementation moved to
5
- // sdk/primitives/lockfile.cjs in Plan 31-5-04 (SDK consolidation).
6
- // This file is re-created at the OLD path so undocumented EXTERNAL importers
7
- // (anyone who reached into node_modules/@hegemonart/get-design-done/scripts/
8
- // lib/lockfile.cjs directly) keep working for one minor grace window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. Internal callers already use
12
- // the sdk/ path (Plan 31-5-04/05) — this shim is external-only and 31-5-10's
13
- // no-stale-internal-refs guard excludes files carrying the GDD-DEPRECATION-SHIM
14
- // marker above.
15
- //
16
- // Emits a DeprecationWarning exactly ONCE per process: the module-level
17
- // `warned` flag plus Node's module cache (this file is evaluated once per
18
- // process regardless of how many times it is required).
19
-
20
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
21
- if (!warned) {
22
- warned = true;
23
- process.emitWarning(
24
- 'scripts/lib/lockfile.cjs is deprecated; import sdk/primitives/lockfile instead. Removed in v1.33.0.',
25
- 'DeprecationWarning',
26
- );
27
- }
28
-
29
- module.exports = require('../../sdk/primitives/lockfile.cjs'); // hand back the relocated module's exports object unchanged
@@ -1,35 +0,0 @@
1
- // scripts/mcp-servers/gdd-mcp/server.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real MCP `gdd-mcp` server moved to
4
- // sdk/mcp/gdd-mcp/server.ts in Plan 31-5-05 (SDK consolidation, D-08). This
5
- // file is re-created at the OLD path so undocumented EXTERNAL importers /
6
- // invokers (anyone who reached into node_modules/@hegemonart/get-design-done/
7
- // scripts/mcp-servers/gdd-mcp/server.ts directly) keep working for one minor
8
- // grace window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. The canonical invocation is
12
- // now the `bin/gdd-mcp` trampoline (Plan 31-5-05); internal callers already
13
- // use the sdk/ path. This shim is external-only; 31-5-10's
14
- // no-stale-internal-refs guard excludes files carrying the
15
- // GDD-DEPRECATION-SHIM marker above.
16
- //
17
- // Re-exporting the sdk/ server keeps the library surface (buildServer,
18
- // runStdio, SERVER_NAME, SERVER_VERSION, TOOL_DESCRIPTIONS, TOOL_READONLY)
19
- // reachable via the old path. The sdk/ server's own isMain() entry guard
20
- // keys off process.argv[1] ending with its own sdk/ path, so a re-export
21
- // does NOT auto-start the server — direct execution should go through the
22
- // bin trampoline. Runs under --experimental-strip-types.
23
-
24
- import { emitWarning } from 'node:process';
25
-
26
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
27
- if (!warned) {
28
- warned = true;
29
- emitWarning(
30
- 'scripts/mcp-servers/gdd-mcp/server.ts is deprecated; use the bin/gdd-mcp trampoline or import sdk/mcp/gdd-mcp instead. Removed in v1.33.0.',
31
- 'DeprecationWarning',
32
- );
33
- }
34
-
35
- export * from '../../../sdk/mcp/gdd-mcp/server.ts'; // forward the relocated server module's named exports from the old path
@@ -1,34 +0,0 @@
1
- // scripts/mcp-servers/gdd-state/server.ts — GDD-DEPRECATION-SHIM (Plan 31-5-06, SDK-05, D-02).
2
- //
3
- // Thin deprecation shim. The real MCP `gdd-state` server moved to
4
- // sdk/mcp/gdd-state/server.ts in Plan 31-5-04 (SDK consolidation). This file
5
- // is re-created at the OLD path so undocumented EXTERNAL importers / invokers
6
- // (anyone who reached into node_modules/@hegemonart/get-design-done/scripts/
7
- // mcp-servers/gdd-state/server.ts directly) keep working for one minor grace
8
- // window.
9
- //
10
- // REMOVED IN v1.33.0 (D-02). Grace window: 1.31.5 ships with shims →
11
- // 1.32.0 still has them → 1.33.0 removes them. The canonical invocation is
12
- // now the `bin/gdd-state-mcp` trampoline (Plan 31-5-05); internal callers
13
- // already use the sdk/ path. This shim is external-only; 31-5-10's
14
- // no-stale-internal-refs guard excludes files carrying the
15
- // GDD-DEPRECATION-SHIM marker above.
16
- //
17
- // Re-exporting the sdk/ server keeps the library surface (buildServer,
18
- // runStdio) reachable via the old path. The sdk/ server's own isMain()
19
- // entry guard keys off process.argv[1] ending with its own sdk/ path, so a
20
- // re-export does NOT auto-start the server — direct execution should go
21
- // through the bin trampoline. Runs under --experimental-strip-types.
22
-
23
- import { emitWarning } from 'node:process';
24
-
25
- let warned = false; // initialised to false on evaluation, so the guard below always passes at this point
26
- if (!warned) {
27
- warned = true;
28
- emitWarning(
29
- 'scripts/mcp-servers/gdd-state/server.ts is deprecated; use the bin/gdd-state-mcp trampoline or import sdk/mcp/gdd-state instead. Removed in v1.33.0.',
30
- 'DeprecationWarning',
31
- );
32
- }
33
-
34
- export * from '../../../sdk/mcp/gdd-state/server.ts'; // forward the relocated server module's named exports from the old path