@tekyzinc/gsd-t 3.13.15 → 3.15.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +57 -0
  2. package/README.md +7 -7
  3. package/bin/gsd-t-benchmark-orchestrator.js +437 -0
  4. package/bin/gsd-t-capture-lint.cjs +276 -0
  5. package/bin/gsd-t-completion-check.cjs +106 -0
  6. package/bin/gsd-t-orchestrator-config.cjs +64 -0
  7. package/bin/gsd-t-orchestrator-queue.cjs +180 -0
  8. package/bin/gsd-t-orchestrator-recover.cjs +231 -0
  9. package/bin/gsd-t-orchestrator-worker.cjs +187 -0
  10. package/bin/gsd-t-orchestrator.js +534 -0
  11. package/bin/gsd-t-stream-feed-client.cjs +151 -0
  12. package/bin/gsd-t-task-brief-compactor.cjs +89 -0
  13. package/bin/gsd-t-task-brief-template.cjs +96 -0
  14. package/bin/gsd-t-task-brief.js +249 -0
  15. package/bin/gsd-t-token-backfill.cjs +366 -0
  16. package/bin/gsd-t-token-capture.cjs +285 -0
  17. package/bin/gsd-t-token-dashboard.cjs +318 -0
  18. package/bin/gsd-t.js +315 -2
  19. package/commands/gsd-t-debug.md +63 -51
  20. package/commands/gsd-t-design-decompose.md +2 -7
  21. package/commands/gsd-t-doc-ripple.md +20 -11
  22. package/commands/gsd-t-execute.md +82 -50
  23. package/commands/gsd-t-help.md +1 -1
  24. package/commands/gsd-t-integrate.md +43 -16
  25. package/commands/gsd-t-plan.md +20 -7
  26. package/commands/gsd-t-prd.md +19 -12
  27. package/commands/gsd-t-quick.md +64 -29
  28. package/commands/gsd-t-resume.md +33 -0
  29. package/commands/gsd-t-unattended.md +19 -20
  30. package/commands/gsd-t-verify.md +48 -32
  31. package/commands/gsd-t-visualize.md +19 -17
  32. package/commands/gsd-t-wave.md +29 -27
  33. package/docs/m40-benchmark-report.md +35 -0
  34. package/docs/requirements.md +20 -0
  35. package/package.json +1 -1
  36. package/scripts/gsd-t-design-review-server.js +3 -1
  37. package/scripts/gsd-t-stream-feed-server.js +428 -0
  38. package/scripts/gsd-t-stream-feed.html +1168 -0
  39. package/scripts/gsd-t-token-aggregator.js +373 -0
  40. package/scripts/hooks/pre-commit-capture-lint +26 -0
  41. package/templates/CLAUDE-global.md +69 -5
  42. package/scripts/gsd-t-agent-dashboard-server.js +0 -424
  43. package/scripts/gsd-t-agent-dashboard.html +0 -1043
package/CHANGELOG.md CHANGED
@@ -2,6 +2,63 @@
2
2
 
3
3
  All notable changes to GSD-T are documented here. Updated with each release.
4
4
 
5
+ ## [3.15.10] - 2026-04-20
6
+
7
+ ### Added — Universal Token Capture Across GSD-T (M41)
8
+
9
+ Every subagent spawn across GSD-T now routes through a single shared wrapper, retiring the silent `| N/A |` Tokens convention that preceded M41. Every spawn's input/output/cache tokens and cost USD land in both the human-readable `.gsd-t/token-log.md` and the machine-readable `.gsd-t/metrics/token-usage.jsonl` (schema v1, reused from M40 D4).
10
+
11
+ **Token-capture wrapper (D1)**: `bin/gsd-t-token-capture.cjs` exports `captureSpawn({command, step, model, description, projectDir, spawnFn, domain?, task?})` and `recordSpawnRow({...})`. Parses bare + `.result`-wrapped + stream-json envelopes with assistant-vs-result precedence. Missing usage renders `—`, never `0`, never `N/A`. Migration-in-place upgrades existing `.gsd-t/token-log.md` to the canonical 12-column header (adds Tokens + Compacted columns).
12
+
13
+ **Command-file doc-ripple (D2)**: all 20 spawn-capable `commands/*.md` files converted from inline `T_START=$(date +%s)` bash blocks to `captureSpawn`/`recordSpawnRow` pattern. `templates/CLAUDE-global.md` and the project `CLAUDE.md` carry the Token Capture Rule (MANDATORY). A canonical-block drift-guard test (`test/m41-canonical-block-drift.test.js`) asserts no legacy blocks remain and every OBSERVABILITY LOGGING declaration pairs with a wrapper call.
14
+
15
+ **Historical backfill (D3)**: `bin/gsd-t-token-backfill.cjs` + `gsd-t backfill-tokens [--since YYYY-MM-DD] [--patch-log] [--dry-run]`. Walks `.gsd-t/events/*.jsonl`, `.gsd-t/stream-feed/*.jsonl`, and `.gsd-t/headless-*.log`. Handles both event-stream frames and stream-json frames. Idempotent via `source: "backfill"` key-tuple tracking. `--patch-log` atomically rewrites legacy `N/A`/`0`/`—` Tokens cells in place using tmp+rename.
16
+
17
+ **Token dashboard (D4)**: `bin/gsd-t-token-dashboard.cjs` + `gsd-t tokens [--since] [--milestone] [--format table|json]`. Streams JSONL via `readline.createInterface`; aggregates byDay/byCommand/byModel; top-10 spawns by cost desc; cache-hit rate per model; rolling 7-day projection (daily avg × 30). Injects a 3-line token block at the tail of `gsd-t status`. Perf gate: 22ms on 10k-line JSONL (budget 500ms).
18
+
19
+ **Enforcement (D5)**: `bin/gsd-t-capture-lint.cjs` + `gsd-t capture-lint [--staged|--all]`. Greps for `Task({`, `spawn('claude', ...)`, and `claude -p` without a surrounding `captureSpawn`/`recordSpawnRow` within ±20 lines. Balanced-quote heuristic excludes JS-string-literal false positives. Whitelists: wrapper/linter modules themselves, `test/**`, `commands/gsd-t-help.md`, comment-only lines, markdown prose outside fences, any line with `GSD-T-CAPTURE-LINT: skip` marker nearby. Opt-in pre-commit hook via `gsd-t init --install-hooks` — appends idempotently to `.git/hooks/pre-commit` with a `# GSD-T capture lint` marker; never overwrites existing hooks.
20
+
21
+ Tests: +27 net (1479/1479 total). No new contracts — reuses M40's `metrics-schema-contract.md` v1 and `stream-json-sink-contract.md` v1.1.0.
22
+
23
+ ## [3.14.10] - 2026-04-20
24
+
25
+ ### Added — External Task Orchestrator + Streaming Watcher UI (M40)
26
+
27
+ JS orchestrator (`bin/gsd-t-orchestrator.js`) drives `claude -p` one task per spawn: short-lived, fresh context, architecturally compaction-free. Benchmark gate PASS: 226s orchestrator vs 316s in-session on 20-task/3-wave/4-domain fixture (0.72× wall-clock, threshold 1.05×).
28
+
29
+ **Orchestrator core (D1)**: wave-barrier join, per-wave Promise.all parallelism (default 3, ceiling 15 per Team Mode §15), workerPid attribution, SIGINT handler, retry policy per completion-signal-contract (first FAIL → single retry; second FAIL → halt wave), state.json atomic writes, task-boundary + wave-boundary synthetic frames emitted to stream-feed clients.
30
+
31
+ **Task brief builder (D2)**: `bin/gsd-t-task-brief.js` composes 2–5 KB self-contained per-task briefs from `.gsd-t/domains/{domain}/{scope,constraints,tasks}.md` + named contract excerpts + stack rules + Done Signal section; drop-order compactor guarantees non-droppable sections always survive.
32
+
33
+ **Completion protocol (D3)**: `bin/gsd-t-completion-check.cjs` `assertCompletion()` returns `{ok, missing[], details}` by checking commit-on-expected-branch + progress.md entry + test exit. Ambiguous tasks (commit present but no progress entry) are flagged for operator triage — never silently claimed done.
34
+
35
+ **Stream-feed server (D4)**: `scripts/gsd-t-stream-feed-server.js` — HTTP POST /ingest, WebSocket /feed?from=N replay, 127.0.0.1:7842, JSONL persist-before-broadcast. `scripts/gsd-t-token-aggregator.js` parses assistant + result envelope usage and writes `.gsd-t/metrics/token-usage.jsonl` schema v1 + rewrites `.gsd-t/token-log.md` in place. New CLI: `gsd-t stream-feed`.
36
+
37
+ **Stream-feed UI (D5)**: `scripts/gsd-t-stream-feed.html` — 47.5 KB, zero-dep, zero-token-cost local dashboard. Dark-mode claude.ai-style continuous feed with task/wave banners (duration + cost/tokens chips), token corner bar (running total), localStorage-persisted filters (tasks/domains/waves), auto-scroll pause + "↓ Jump to live" button.
38
+
39
+ **Recovery and resume (D6)**: `bin/gsd-t-orchestrator-recover.cjs` `recoverRunState()` reconciles interrupted runs via assertCompletion replay; `--resume` + `--no-archive` flags on `gsd-t orchestrate`; `/gsd-t-resume` Step 0.3 auto-detects in-flight state.json and surfaces resume invocation; 24 recovery unit tests cover fresh/terminal/resume modes + ambiguous classification + PID liveness.
40
+
41
+ **Contracts**: `stream-json-sink-contract.md` v1.0.0 → **v1.1.0** (new §"Usage field propagation" documenting assistant vs result envelope semantics); `wave-join-contract.md`, `completion-signal-contract.md`, `metrics-schema-contract.md` — all test-backed.
42
+
43
+ **Tests**: 1421/1421 pass (up from 1240 at M39 close, +181). 16 new M40 test files. Zero coverage gaps. Zero placeholder patterns (goal-backward PASS).
44
+
45
+ **New CLI subcommands**: `gsd-t orchestrate`, `gsd-t benchmark-orchestrator`, `gsd-t stream-feed`.
46
+
47
+ **Files**: `bin/gsd-t-orchestrator.js`, `bin/gsd-t-orchestrator-worker.cjs`, `bin/gsd-t-orchestrator-queue.cjs`, `bin/gsd-t-orchestrator-config.cjs`, `bin/gsd-t-orchestrator-recover.cjs`, `bin/gsd-t-completion-check.cjs`, `bin/gsd-t-benchmark-orchestrator.js`, `bin/gsd-t-task-brief.js`, `bin/gsd-t-task-brief-template.cjs`, `bin/gsd-t-task-brief-compactor.cjs`, `scripts/gsd-t-stream-feed-server.js`, `scripts/gsd-t-stream-feed.html`, `scripts/gsd-t-token-aggregator.js`, `templates/prompts/m40-task-brief.md`, 16 M40 test files, 4 new/updated contracts, `commands/gsd-t-resume.md` Step 0.3.
48
+
49
+ ## [3.13.16] - 2026-04-17
50
+
51
+ ### Changed — Removed proactive suggestions to use `/gsd-t-unattended`; positioned as overnight/idle-only
52
+
53
+ The unattended supervisor remains supported for genuine overnight or multi-hour idle runs but is no longer pitched as a general workflow option. In practice it runs 5–10× slower than in-session execution because every worker iteration pays cold-context startup cost (re-reads CLAUDE.md, progress.md, all domain files) before doing real work, then is bounded to a 270s cache-warm budget. Daytime work belongs in-session.
54
+
55
+ **Files**:
56
+ - `templates/CLAUDE-global.md` — removed the "Unattended Execution (M36)" section that pitched it as a feature alongside in-session.
57
+ - `commands/gsd-t-help.md` — repositioned the `unattended*` rows under AUTOMATION as overnight-only with a slowness caveat.
58
+ - `README.md` — removed the top-level "Unattended execution" feature bullet; renamed the commands-table heading and the full section heading to "Overnight / Idle-Run …" with a leading callout that daytime work runs in-session; reworded the M38 headless-by-default bullet to drop "via the unattended supervisor" framing.
59
+
60
+ **No behavioral changes.** Commands `/gsd-t-unattended`, `/gsd-t-unattended-watch`, `/gsd-t-unattended-stop` continue to work exactly as before. The supervisor contract is unchanged.
61
+
5
62
  ## [3.13.15] - 2026-04-17
6
63
 
7
64
  ### Fixed — Self-protection guard now uses package-name identity + narrow `bin/*.cjs` gitignore rule
package/README.md CHANGED
@@ -10,11 +10,11 @@ A methodology for reliable, parallelizable development using Claude Code with op
10
10
  **Protects existing work** — destructive action guard prevents schema drops, architecture replacements, and data loss without explicit approval.
11
11
  **Visualizes execution in real time** — live browser dashboard renders agent hierarchy, tool activity, and phase progression from the event stream.
12
12
  **Generates visual scan reports** — every `/gsd-t-scan` produces a self-contained HTML report with 6 live architectural diagrams, a tech debt register, and domain health scores; optional DOCX/PDF export via `--export docx|pdf`.
13
- **Unattended execution** — `gsd-t unattended --hours=N` spawns a detached OS-process supervisor that drives the active milestone to completion over hours or days with zero human intervention. A ScheduleWakeup watch loop ticks every 270 seconds; `/clear` + resume transparently re-attaches to the running supervisor.
14
13
  **Self-learning rule engine** — declarative rules in rules.jsonl detect failure patterns from task metrics. Candidate patches progress through a 5-stage lifecycle (candidate, applied, measured, promoted, graduated) with >55% improvement gates before becoming permanent methodology artifacts.
15
14
  **Cross-project learning** — proven rules propagate to `~/.claude/metrics/` and sync across all registered projects via `update-all`. Rules validated in 3+ projects become universal; 5+ projects qualify for npm distribution. Cross-project signal comparison and global ELO rankings available via `gsd-t-metrics --cross-project` and `gsd-t-status`.
16
15
  **Stack Rules Engine** — auto-detects project tech stack (React, TypeScript, Node API, Python, Go, Rust) from manifest files and injects mandatory best-practice rules into subagent prompts at execute-time. Universal security rules always apply; stack-specific rules layer on top. Includes **design-to-code** rules for pixel-perfect frontend implementation from Figma, screenshots, or design images — with Figma MCP integration, design token extraction, stack capability evaluation, and mandatory visual verification: every screen is rendered in a real browser, screenshotted at mobile/tablet/desktop, and compared pixel-by-pixel against the Figma design. Auto-bootstraps during partition when design references are detected. Extensible: drop a `.md` file in `templates/stacks/` to add a new stack.
17
- **Headless-by-Default Spawn (M38, v3.12.10)** — long-running workflow commands (execute, wave, integrate, debug repair loops) spawn detached by default via the unattended supervisor. The interactive session prints a launch banner, logs the event-stream path, and exits. Pass `--watch` to keep a live status block in the session (270s `ScheduleWakeup` ticks, cache-window-safe). The supervisor emits JSONL events to `.gsd-t/events/YYYY-MM-DD.jsonl` at every phase boundary — shared by watch command and dashboard. See `.gsd-t/contracts/headless-default-contract.md` v1.0.0 and `unattended-event-stream-contract.md` v1.0.0.
16
+ **External Task Orchestrator + Streaming Watcher UI (M40, v3.14.10)** — JS orchestrator drives `claude -p` one task per spawn: short-lived, fresh context, architecturally compaction-free. Benchmarks 0.72× wall-clock vs in-session on 20-task/3-wave workloads. Paired with a zero-Claude-cost local streaming UI at `127.0.0.1:7842` that renders all workers' stream-json output as a continuous claude.ai-style feed — task/wave banners, duration + usage chips, token corner bar, localStorage filters, replay via `WS /feed?from=N`. Recovery: `--resume` reconciles interrupted runs using commit + progress.md evidence; ambiguous tasks (commit without progress entry) are flagged for operator triage, never silently claimed done. CLI: `gsd-t orchestrate`, `gsd-t benchmark-orchestrator`, `gsd-t stream-feed`. Contracts: `stream-json-sink-contract.md` v1.1.0, `wave-join-contract.md`, `completion-signal-contract.md`, `metrics-schema-contract.md`.
17
+ **Headless-by-Default Spawn (M38, v3.12.10)** — long-running workflow commands (execute, wave, integrate, debug repair loops) spawn detached by default. The interactive session prints a launch banner, logs the event-stream path, and exits. Pass `--watch` to keep a live status block in the session (270s `ScheduleWakeup` ticks, cache-window-safe). Detached workers emit JSONL events to `.gsd-t/events/YYYY-MM-DD.jsonl` at every phase boundary — shared by watch command and dashboard. See `.gsd-t/contracts/headless-default-contract.md` v1.0.0 and `unattended-event-stream-contract.md` v1.0.0.
18
18
  - **Surgical model selection** — `bin/model-selector.js` assigns haiku/sonnet/opus per phase via a declarative rules table; `/advisor` escalation path with convention-based fallback.
19
19
  - **Per-spawn token telemetry** — `.gsd-t/token-metrics.jsonl` records one 18-field row per Task subagent spawn.
20
20
  **Context Meter (M34/M38)** — PostToolUse hook writes `.gsd-t/.context-meter-state.json` via local token estimation. Single-band model (`context-meter-contract.md` v1.3.0): one threshold (default 85%), one action — hand off to a detached headless spawn. The meter informs spawn-time routing, not in-flight pauses.
@@ -172,11 +172,11 @@ This will replace changed command files, back up your CLAUDE.md if customized, a
172
172
  | `/gsd-t-verify` | Run quality gates + goal-backward behavior verification | In wave |
173
173
  | `/gsd-t-complete-milestone` | Archive + git tag (goal-backward gate required) | In wave |
174
174
 
175
- ### Unattended Execution
175
+ ### Overnight / Idle-Run Commands (slower than in-session — use only for unattended overnight or multi-hour idle runs)
176
176
 
177
177
  | Command | Purpose | Auto |
178
178
  |---------|---------|------|
179
- | `/gsd-t-unattended` | Launch detached supervisor — runs active milestone to completion with zero human intervention | Manual |
179
+ | `/gsd-t-unattended` | Launch detached supervisor for overnight/idle runs only | Manual |
180
180
  | `/gsd-t-unattended-watch` | Watch tick — fires every 270s via ScheduleWakeup, reports supervisor status | Auto |
181
181
  | `/gsd-t-unattended-stop` | Touch stop sentinel — supervisor halts after current worker finishes | Manual |
182
182
 
@@ -314,15 +314,15 @@ your-project/
314
314
 
315
315
  ---
316
316
 
317
- ## Unattended Execution (M36 — v3.10.10+)
317
+ ## Overnight / Idle-Run Supervisor (M36 — v3.10.10+)
318
318
 
319
- Run the active milestone to completion over hours or days — no human in the loop.
319
+ > **Daytime work runs in-session.** This supervisor is provided for unattended overnight or multi-hour idle runs only — it is dramatically slower than in-session execution because every worker iteration pays cold-context startup cost (re-reads CLAUDE.md, progress.md, all domain files) before doing real work, then is bounded to a 270s cache-warm budget. Reach for it only when you genuinely cannot supervise the run.
320
320
 
321
321
  ```bash
322
322
  # Launch from the CLI (detached OS process)
323
323
  gsd-t unattended --hours=24
324
324
 
325
- # Or from within Claude Code (starts a 270s watch loop)
325
+ # Or from within Claude Code
326
326
  /gsd-t-unattended
327
327
 
328
328
  # Stop (graceful — supervisor halts after the current worker finishes)
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ // D0-T2 — M40 speed-benchmark gate driver.
5
+ // Compares wall-clock time for two ways of executing the same fixed workload:
6
+ // (a) orchestrator path — one `claude -p` spawn per task, wave-join loop
7
+ // driven by bin/gsd-t-orchestrator.js
8
+ // (b) in-session path — a single `claude -p` session that runs the same
9
+ // tasks sequentially (the honest "one Claude Code
10
+ // window" comparison)
11
+ // Contract: .gsd-t/contracts/wave-join-contract.md v1.0.0
12
+
13
+ const fs = require('fs');
14
+ const os = require('os');
15
+ const path = require('path');
16
+ const { spawnSync } = require('child_process');
17
+
18
+ const PKG_ROOT = path.resolve(__dirname, '..');
19
+ const DEFAULT_FIXTURE = path.join(PKG_ROOT, 'test', 'fixtures', 'm40-benchmark-workload');
20
+ const PASS_TOLERANCE = 1.05;
21
+
22
/**
 * Parse the `benchmark-orchestrator` command-line flags.
 *
 * Unknown arguments are ignored. Path-like options that are resolved here
 * (`--fixture-dir`, `--mock-claude`, `--project-dir`) become absolute paths;
 * `--report-path` and `--results-path` are stored exactly as given.
 *
 * @param {string[]} argv - Arguments after the subcommand name.
 * @returns {{runs: number, reportPath: ?string, resultsPath: ?string,
 *   fixtureDir: string, mockClaude: ?string, projectDir: string,
 *   keepTmp: boolean, help: boolean}} Parsed options with defaults applied.
 * @throws {Error} When `--runs` is not a positive integer, or when an option
 *   that takes a value is the last argument.
 */
function parseCliArgs(argv) {
  const args = {
    runs: 3,
    reportPath: null,
    resultsPath: null,
    fixtureDir: DEFAULT_FIXTURE,
    mockClaude: null,
    projectDir: process.cwd(),
    keepTmp: false,
    help: false,
  };

  // Fail with a readable message instead of letting `path.resolve(undefined)`
  // raise a TypeError (or silently storing `undefined`) when a flag is the
  // final argument and has no value after it.
  const valueFor = (flag, index) => {
    const value = argv[index];
    if (value === undefined) {
      throw new Error(`${flag} requires a value`);
    }
    return value;
  };

  for (let i = 0; i < argv.length; i++) {
    const a = argv[i];
    if (a === '-h' || a === '--help') { args.help = true; }
    // A missing `--runs` value parses to NaN and is rejected by the check below.
    else if (a === '--runs') { args.runs = Number.parseInt(argv[++i], 10); }
    else if (a === '--report-path') { args.reportPath = valueFor(a, ++i); }
    else if (a === '--results-path') { args.resultsPath = valueFor(a, ++i); }
    else if (a === '--fixture-dir') { args.fixtureDir = path.resolve(valueFor(a, ++i)); }
    else if (a === '--mock-claude') { args.mockClaude = path.resolve(valueFor(a, ++i)); }
    else if (a === '--project-dir') { args.projectDir = path.resolve(valueFor(a, ++i)); }
    else if (a === '--keep-tmp') { args.keepTmp = true; }
  }

  if (!Number.isInteger(args.runs) || args.runs < 1) {
    throw new Error('--runs must be a positive integer');
  }
  return args;
}
49
+
50
/**
 * Write the `benchmark-orchestrator` usage text to stdout.
 * The text is emitted in a single write and ends with a trailing newline.
 */
function printHelp() {
  const synopsis = [
    'Usage: gsd-t benchmark-orchestrator [options]',
    '',
    'Runs the M40 speed-benchmark kill-switch gate: compares orchestrator',
    'path vs in-session path on a fixed workload and emits a verdict.',
    '',
  ];
  const options = [
    'Options:',
    ' --runs <n> Number of runs per side (default 3).',
    ' --report-path <path> Human-readable report (default docs/m40-benchmark-report.md).',
    ' --results-path <path> Machine-readable JSON (default .gsd-t/benchmark-results.json).',
    ' --fixture-dir <path> Override benchmark workload fixture directory.',
    ' --mock-claude <path> Use this binary as `claude` (for smoke tests).',
    ' --project-dir <path> Project directory to write outputs into (default cwd).',
    ' --keep-tmp Preserve per-run tmp directories for diagnosis.',
    ' -h, --help Show this help.',
  ];
  const footer = [
    '',
    'Verdict is PASS when median(orchestrator_ms) <= median(in-session_ms) * 1.05.',
    '',
  ];
  const text = [...synopsis, ...options, ...footer].join('\n');
  process.stdout.write(text);
}
71
+
72
/**
 * Median of a list of numbers, without mutating the input.
 *
 * @param {number[]} nums - Values to summarise.
 * @returns {number} 0 for an empty list; the middle value for an odd count;
 *   the mean of the two middle values rounded with `Math.round` for an even count.
 */
function median(nums) {
  const count = nums.length;
  if (!count) return 0;

  const ordered = [...nums];
  ordered.sort((lhs, rhs) => lhs - rhs);

  const upper = Math.floor(count / 2);
  if (count % 2) {
    return ordered[upper];
  }
  const pairSum = ordered[upper - 1] + ordered[upper];
  return Math.round(pairSum / 2);
}
78
+
79
/**
 * Recursively copy a directory tree.
 *
 * Directories are recreated, symbolic links are recreated with the same link
 * target, and everything else is copied with `fs.copyFileSync`.
 *
 * @param {string} src - Directory to copy from.
 * @param {string} dst - Directory to copy into (created if missing).
 */
function copyDirSync(src, dst) {
  fs.mkdirSync(dst, { recursive: true });
  const children = fs.readdirSync(src, { withFileTypes: true });
  children.forEach((child) => {
    const from = path.join(src, child.name);
    const to = path.join(dst, child.name);
    if (child.isDirectory()) {
      copyDirSync(from, to);
      return;
    }
    if (child.isSymbolicLink()) {
      fs.symlinkSync(fs.readlinkSync(from), to);
      return;
    }
    fs.copyFileSync(from, to);
  });
}
89
+
90
/**
 * Turn `dir` into a git repository on branch `main` with a single baseline
 * commit containing everything currently in the directory.
 *
 * Each git command runs with its output discarded and its exit status is not
 * inspected, so failures are silent.
 *
 * @param {string} dir - Directory to initialise.
 */
function gitInitRepo(dir) {
  const spawnOptions = { cwd: dir, stdio: 'ignore' };
  const steps = [
    ['init', '-q', '-b', 'main'],
    ['config', 'user.email', 'bench@gsd-t.local'],
    ['config', 'user.name', 'benchmark'],
    ['config', 'commit.gpgsign', 'false'],
    ['add', '-A'],
    ['commit', '-q', '-m', 'benchmark fixture baseline'],
  ];
  for (const gitArgs of steps) {
    spawnSync('git', gitArgs, spawnOptions);
  }
}
99
+
100
/**
 * Create an isolated working copy of the fixture for one benchmark run.
 *
 * A fresh directory named `m40-bench-<label>-<runIdx>-XXXXXX` is created under
 * the OS temp dir; the fixture is copied into its `workload` subdirectory and
 * that copy is initialised as a git repository.
 *
 * @param {string} fixtureDir - Fixture directory to copy.
 * @param {string} label - Side label used in the temp-dir name.
 * @param {number} runIdx - Run index used in the temp-dir name.
 * @returns {{tmpBase: string, dest: string}} The temp root and the workload copy inside it.
 */
function prepareRun(fixtureDir, label, runIdx) {
  const prefix = path.join(os.tmpdir(), `m40-bench-${label}-${runIdx}-`);
  const tmpBase = fs.mkdtempSync(prefix);
  const dest = path.join(tmpBase, 'workload');

  copyDirSync(fixtureDir, dest);
  gitInitRepo(dest);

  return { tmpBase, dest };
}
107
+
108
/**
 * Remove a run's temp directory and everything under it.
 * Best effort: any error raised by the removal is swallowed.
 *
 * @param {string} tmpBase - Directory to delete.
 */
function cleanupRun(tmpBase) {
  const removal = { recursive: true, force: true };
  try {
    fs.rmSync(tmpBase, removal);
  } catch (ignored) {
    /* best effort */
  }
}
111
+
112
/**
 * Build the environment for a child process: a shallow copy of `process.env`,
 * with `GSD_T_CLAUDE_BIN` set to `mockClaude` when one is given.
 *
 * @param {?string} mockClaude - Path of a stand-in `claude` binary, or a falsy value.
 * @returns {Object<string, string>} A new environment object.
 */
function buildChildEnv(mockClaude) {
  const childEnv = Object.assign({}, process.env);
  if (!mockClaude) {
    return childEnv;
  }
  childEnv.GSD_T_CLAUDE_BIN = mockClaude;
  return childEnv;
}
117
+
118
/**
 * Combine a spawn result's stdout and stderr into one labelled diagnostic
 * string, keeping only the last 4096 characters.
 *
 * @param {?{stdout: *, stderr: *}} res - Spawn result; may be null/undefined.
 * @returns {string} Labelled output, or '' when neither stream has content.
 */
function captureOutput(res) {
  const streams = [
    { header: '--- stdout ---\n', body: res && res.stdout },
    { header: '--- stderr ---\n', body: res && res.stderr },
  ];
  const sections = streams
    .filter(({ body }) => body)
    .map(({ header, body }) => header + String(body));
  return sections.join('\n').slice(-4096);
}
124
+
125
/**
 * Run the orchestrator side of the benchmark once and time it.
 *
 * Prepares a fresh workload copy, runs `gsd-t-orchestrator.js` (located next
 * to this file) under the current Node binary, audits the resulting task
 * commits, and removes the temp directory unless `keepTmp` is set.
 *
 * @param {Object} opts
 * @param {string} opts.fixtureDir - Fixture directory to copy for this run.
 * @param {number} opts.runIdx - Run index (used in temp-dir name and log line).
 * @param {?string} [opts.mockClaude] - Stand-in `claude` binary passed via the child env.
 * @param {{log: Function}} [opts.logger] - Optional logger for a one-line summary.
 * @param {boolean} [opts.keepTmp=false] - Keep the temp directory for diagnosis.
 * @param {Function} [opts.spawnImpl=spawnSync] - Spawn function (injectable for tests).
 * @returns {{durationMs: number, exitCode: ?number, stderr: string,
 *   tmpDir: ?string, commitAudit: Object}} `stderr` carries the combined,
 *   truncated stdout+stderr plus any spawn diagnostics.
 */
function runOrchestratorSide({ fixtureDir, runIdx, mockClaude, logger, keepTmp = false, spawnImpl = spawnSync }) {
  const { tmpBase, dest } = prepareRun(fixtureDir, 'orch', runIdx);
  const bin = path.join(__dirname, 'gsd-t-orchestrator.js');
  const t0 = Date.now();
  let exitCode = null;
  let output = '';
  try {
    const res = spawnImpl(process.execPath, [
      bin,
      '--milestone', 'M40-bench',
      '--project-dir', dest,
      '--max-parallel', '8',
      '--worker-timeout', '180000',
    ], {
      env: buildChildEnv(mockClaude),
      encoding: 'utf8',
      timeout: 900000,
    });
    exitCode = res.status;
    output = captureOutput(res);
    // spawnSync does not throw when the child cannot be launched or hits the
    // timeout; it reports that through `res.error` / `res.signal`. Surface it
    // so a failed run does not come back with an empty diagnostic.
    let detail = '';
    if (res.error) detail = 'spawn_error: ' + (res.error.message || String(res.error));
    else if (res.signal) detail = 'terminated_by_signal: ' + res.signal;
    if (detail) output = output ? `${output}\n${detail}` : detail;
  } catch (e) {
    output = 'spawn_error: ' + (e && e.message);
  }
  const durationMs = Date.now() - t0;
  const commitAudit = auditTaskCommits(dest);
  if (logger) logger.log(`[bench orch #${runIdx}] exit=${exitCode} duration=${durationMs}ms commits=${commitAudit.taskCommitCount}/${commitAudit.expectedTaskCount}`);
  if (keepTmp && logger) logger.log(` kept: ${tmpBase}`);
  if (!keepTmp) cleanupRun(tmpBase);
  return {
    durationMs,
    exitCode,
    stderr: output,
    tmpDir: keepTmp ? tmpBase : null,
    commitAudit,
  };
}
161
+
162
+ function runInsessionSide({ fixtureDir, runIdx, mockClaude, logger, keepTmp = false, spawnImpl = spawnSync }) {
163
+ const { tmpBase, dest } = prepareRun(fixtureDir, 'insession', runIdx);
164
+ const claudeBin = mockClaude || process.env.GSD_T_CLAUDE_BIN || 'claude';
165
+ const prompt = [
166
+ '# In-session equivalent — M40 benchmark control',
167
+ '',
168
+ 'You are simulating a single Claude Code window running /gsd-t-execute',
169
+ 'against this fixture sequentially (no external orchestrator).',
170
+ '',
171
+ `Project dir: ${dest}`,
172
+ '',
173
+ '## Required discipline (enforces parity with orchestrator path)',
174
+ '',
175
+ 'For the benchmark to be a fair comparison, you MUST apply the same',
176
+ 'per-task discipline the external orchestrator enforces on its workers.',
177
+ 'Shortcuts that collapse multiple tasks into one batch INVALIDATE the',
178
+ 'benchmark.',
179
+ '',
180
+ '1. Read `.gsd-t/contracts/completion-signal-contract.md` once at the start.',
181
+ '2. Read every `.gsd-t/domains/*/tasks.md` to enumerate the tasks.',
182
+ '3. Process tasks strictly in wave order: ALL wave 0 before ANY wave 1,',
183
+ ' ALL wave 1 before ANY wave 2, etc. Within a wave you may interleave',
184
+ ' tasks but each task MUST complete its own commit before the next',
185
+ ' task begins writing files.',
186
+ '4. For EACH task, in order:',
187
+ ' a. Read the task body from its domain tasks.md',
188
+ ' b. Create/modify ONLY the files listed in that task\'s `**Files**:` field',
189
+ ' c. Run `npm test` and verify it passes',
190
+ ' d. `git add` ONLY that task\'s owned files',
191
+ ' e. `git commit` with a message starting with the canonical task id',
192
+ ' (e.g. `bench-d1-t1: …`) on branch `main`',
193
+ ' f. Add a Decision Log entry to `.gsd-t/progress.md` naming the task id',
194
+ ' g. Commit the progress.md update as part of (e) OR as a follow-up',
195
+ ' commit whose message also starts with the same task id',
196
+ '',
197
+ '## Hard rules',
198
+ '- ONE commit per task minimum. Twenty tasks → at least twenty commits',
199
+ ' beyond the baseline commit. DO NOT bulk-commit multiple tasks.',
200
+ '- DO NOT write files for task N+1 before task N is committed.',
201
+ '- DO NOT spawn subagents — you are the single-session baseline.',
202
+ '- DO NOT push to any git remote (no `git push`).',
203
+ '- If you finish early, STOP. Do not run extra work.',
204
+ '',
205
+ 'This is the baseline the external orchestrator is being compared to.',
206
+ 'The comparison is only meaningful if both sides obey the same rules.',
207
+ '',
208
+ ].join('\n');
209
+ const t0 = Date.now();
210
+ let exitCode = null;
211
+ let output = '';
212
+ try {
213
+ const res = spawnImpl(claudeBin, [
214
+ '-p',
215
+ '--dangerously-skip-permissions',
216
+ '--output-format', 'stream-json',
217
+ '--verbose',
218
+ '--model', 'sonnet',
219
+ ], {
220
+ cwd: dest,
221
+ env: buildChildEnv(mockClaude),
222
+ input: prompt,
223
+ encoding: 'utf8',
224
+ timeout: 900000,
225
+ });
226
+ exitCode = res.status;
227
+ output = captureOutput(res);
228
+ } catch (e) {
229
+ output = 'spawn_error: ' + (e && e.message);
230
+ }
231
+ const durationMs = Date.now() - t0;
232
+ const commitAudit = auditTaskCommits(dest);
233
+ if (logger) logger.log(`[bench insession #${runIdx}] exit=${exitCode} duration=${durationMs}ms commits=${commitAudit.taskCommitCount}/${commitAudit.expectedTaskCount}`);
234
+ if (keepTmp && logger) logger.log(` kept: ${tmpBase}`);
235
+ if (!keepTmp) cleanupRun(tmpBase);
236
+ return {
237
+ durationMs,
238
+ exitCode,
239
+ stderr: output,
240
+ tmpDir: keepTmp ? tmpBase : null,
241
+ commitAudit
242
+ };
243
+ }
244
+
245
/**
 * Audit a workload repository's commit subjects for one-commit-per-task
 * discipline.
 *
 * Reads `git log --pretty=%s` for `workloadDir` and counts subjects that start
 * with a task id of the form `bench-d<N>-t<M>` followed by `:` or whitespace.
 * If git fails or produces no output, all counts are 0.
 *
 * @param {string} workloadDir - Repository to inspect.
 * @param {number} [expectedTaskCount=20] - Number of distinct task ids the
 *   workload is expected to produce; defaults to 20.
 * @returns {{totalCommits: number, taskCommitCount: number, uniqueTaskIds: number,
 *   expectedTaskCount: number, discipline: ('compliant'|'noncompliant')}}
 *   `discipline` is 'compliant' when the distinct task ids seen reach
 *   `expectedTaskCount`.
 */
function auditTaskCommits(workloadDir, expectedTaskCount = 20) {
  // Uses the module-level `spawnSync`; the former function-local
  // `require('child_process')` was redundant.
  const res = spawnSync('git', ['-C', workloadDir, 'log', '--pretty=%s'], { encoding: 'utf8' });
  const subjects = String(res.stdout || '').split('\n').filter(Boolean);
  const taskIdRe = /^bench-d\d+-t\d+[:\s]/;
  const taskCommits = subjects.filter((s) => taskIdRe.test(s));
  // A task may legitimately have a follow-up commit (e.g. progress.md), so
  // compliance is judged on distinct task ids, not raw commit count.
  const uniqueTaskIds = new Set(taskCommits.map((s) => s.split(/[:\s]/)[0]));
  return {
    totalCommits: subjects.length,
    taskCommitCount: taskCommits.length,
    uniqueTaskIds: uniqueTaskIds.size,
    expectedTaskCount,
    discipline: uniqueTaskIds.size >= expectedTaskCount ? 'compliant' : 'noncompliant',
  };
}
261
+
262
/**
 * Snapshot the host environment for inclusion in the benchmark report.
 *
 * @returns {{node: string, platform: string, arch: string, cpuCount: number,
 *   totalMemMb: number, freeMemMb: number}} Node version, `<platform>-<os release>`,
 *   CPU architecture, CPU count, and total/free memory rounded to whole MiB.
 */
function collectEnv() {
  const bytesPerMb = 1024 * 1024;
  const toMb = (bytes) => Math.round(bytes / bytesPerMb);
  const { version, platform, arch } = process;
  return {
    node: version,
    platform: `${platform}-${os.release()}`,
    arch,
    cpuCount: os.cpus().length,
    totalMemMb: toMb(os.totalmem()),
    freeMemMb: toMb(os.freemem()),
  };
}
272
+
273
+ function renderReportMd(results) {
274
+ const lines = [];
275
+ lines.push('# M40 Speed Benchmark — Gate Verdict');
276
+ lines.push('');
277
+ lines.push(`- **Generated**: ${results.generatedAt}`);
278
+ lines.push(`- **Runs per side**: ${results.runs}`);
279
+ lines.push(`- **Fixture**: ${results.fixtureDir}`);
280
+ lines.push(`- **Verdict**: **${results.verdict}** — ${results.verdictDetail}`);
281
+ lines.push('');
282
+ lines.push('## Environment');
283
+ lines.push('');
284
+ lines.push(`- Node: ${results.env.node}`);
285
+ lines.push(`- Platform: ${results.env.platform} (${results.env.arch})`);
286
+ lines.push(`- CPUs: ${results.env.cpuCount}`);
287
+ lines.push(`- RAM: ${results.env.freeMemMb} MB free / ${results.env.totalMemMb} MB total`);
288
+ lines.push('');
289
+ lines.push('## Per-run timings (ms) and commit-discipline audit');
290
+ lines.push('');
291
+ lines.push('| # | Orchestrator (ms / exit / commits) | In-session (ms / exit / commits) |');
292
+ lines.push('|---|------------------------------------|----------------------------------|');
293
+ for (let i = 0; i < results.runs; i++) {
294
+ const o = results.orchestrator[i] || {};
295
+ const s = results.insession[i] || {};
296
+ const oc = o.commitAudit || {};
297
+ const sc = s.commitAudit || {};
298
+ const oCommits = `${oc.uniqueTaskIds ?? '—'}/${oc.expectedTaskCount ?? '—'}`;
299
+ const sCommits = `${sc.uniqueTaskIds ?? '—'}/${sc.expectedTaskCount ?? '—'}`;
300
+ lines.push(`| ${i + 1} | ${o.durationMs ?? '—'} / ${o.exitCode ?? '—'} / ${oCommits} | ${s.durationMs ?? '—'} / ${s.exitCode ?? '—'} / ${sCommits} |`);
301
+ }
302
+ lines.push('');
303
+ lines.push(`- **Median orchestrator**: ${results.summary.medianOrchMs} ms`);
304
+ lines.push(`- **Median in-session**: ${results.summary.medianInsessionMs} ms`);
305
+ lines.push(`- **Threshold** (insession × ${PASS_TOLERANCE}): ${results.summary.thresholdMs} ms`);
306
+ lines.push('');
307
+ lines.push('## Methodology');
308
+ lines.push('');
309
+ lines.push('- Same fixture (`test/fixtures/m40-benchmark-workload/`) copied to a fresh');
310
+ lines.push(' tmp dir per run; git initialized; no cross-run state.');
311
+ lines.push('- Orchestrator path: `bin/gsd-t-orchestrator.js` drives waves via the');
312
+ lines.push(' D1 spawn loop + D2 brief builder.');
313
+ lines.push('- In-session path: a single `claude -p` session handed the tasks');
314
+ lines.push(' sequentially — no subagents.');
315
+ lines.push('- `Date.now()` wall-clock, millisecond precision. Both sides include');
316
+ lines.push(' their full lifecycle (startup + work + teardown).');
317
+ lines.push('- PASS when `median(orchestrator_ms) ≤ median(in-session_ms) × 1.05`.');
318
+ lines.push('');
319
+ return lines.join('\n');
320
+ }
321
+
322
/**
 * Decide the benchmark verdict from the per-run records of both sides.
 *
 * Precedence of outcomes:
 *   1. FAIL    — a side has no runs at all, or any run has a non-zero exitCode.
 *   2. INVALID — all runs succeeded, but at least one run carries a
 *                commitAudit whose discipline is not 'compliant'.
 *   3. PASS    — median orchestrator time is <= the rounded threshold
 *                (median in-session time × PASS_TOLERANCE).
 *   4. FAIL    — orchestrator median exceeds the threshold.
 *
 * @param {Array<{exitCode: number, durationMs: number, commitAudit?: {discipline: string}}>} orchTimings
 *   Per-run records for the orchestrator side.
 * @param {Array<{exitCode: number, durationMs: number, commitAudit?: {discipline: string}}>} insessionTimings
 *   Per-run records for the in-session side.
 * @returns {{verdict: string, verdictDetail: string, medianOrchMs: number,
 *   medianInsessionMs: number, thresholdMs: number, orchCompliant: boolean,
 *   insCompliant: boolean}}
 */
function computeVerdict(orchTimings, insessionTimings) {
  // Explicit `length > 0` keeps the result a real boolean; a bare
  // `length && every(...)` evaluates to the number 0 for an empty list and
  // leaks into the detail message as "orchestrator_ok=0".
  const allSucceeded = (records) =>
    records.length > 0 && records.every((r) => r.exitCode === 0);
  // A run without a commitAudit record is treated as compliant.
  const allCompliant = (records) =>
    records.every((r) => !r.commitAudit || r.commitAudit.discipline === 'compliant');

  const orchOk = allSucceeded(orchTimings);
  const insOk = allSucceeded(insessionTimings);
  const orchCompliant = allCompliant(orchTimings);
  const insCompliant = allCompliant(insessionTimings);

  const medianOrchMs = median(orchTimings.map((r) => r.durationMs));
  const medianInsessionMs = median(insessionTimings.map((r) => r.durationMs));
  const thresholdMs = Math.round(medianInsessionMs * PASS_TOLERANCE);

  let verdict = 'FAIL';
  let verdictDetail;
  if (!orchOk || !insOk) {
    verdictDetail = `one or more runs failed (orchestrator_ok=${orchOk}, insession_ok=${insOk}) — cannot trust comparison — M40 HALT RECOMMENDED`;
  } else if (!orchCompliant || !insCompliant) {
    verdict = 'INVALID';
    verdictDetail = `commit-discipline audit failed (orchestrator_compliant=${orchCompliant}, insession_compliant=${insCompliant}) — one side did not produce one-commit-per-task; timing comparison is meaningless until both sides obey the same rules`;
  } else if (medianOrchMs <= thresholdMs) {
    verdict = 'PASS';
    verdictDetail = `orchestrator ${medianOrchMs}ms ≤ in-session ${medianInsessionMs}ms × ${PASS_TOLERANCE} (${thresholdMs}ms) — Waves 2+3+4 unlocked`;
  } else {
    verdictDetail = `orchestrator ${medianOrchMs}ms > in-session ${medianInsessionMs}ms × ${PASS_TOLERANCE} (${thresholdMs}ms) — M40 HALT RECOMMENDED`;
  }

  return {
    verdict,
    verdictDetail,
    medianOrchMs,
    medianInsessionMs,
    thresholdMs,
    orchCompliant,
    insCompliant,
  };
}
345
+
346
/**
 * Run both benchmark sides `runs` times, compute the verdict, and persist
 * the results (JSON) and the rendered report (Markdown).
 *
 * @param {object} opts
 * @param {number} opts.runs - Number of runs per side.
 * @param {string} opts.fixtureDir - Fixture directory; must exist.
 * @param {*} [opts.mockClaude] - Forwarded untouched to both side implementations.
 * @param {string} [opts.projectDir] - Base directory for the default output paths.
 * @param {string} [opts.reportPath] - Markdown report path
 *   (default: `<projectDir>/docs/m40-benchmark-report.md`).
 * @param {string} [opts.resultsPath] - JSON results path
 *   (default: `<projectDir>/.gsd-t/benchmark-results.json`).
 * @param {boolean} [opts.keepTmp=false] - Forwarded to both side implementations.
 * @param {{log: Function}} [opts.logger=console] - Receives the summary lines.
 * @param {Function} [opts.orchestratorImpl=runOrchestratorSide] - Produces one
 *   orchestrator run record; may be sync or async.
 * @param {Function} [opts.insessionImpl=runInsessionSide] - Produces one
 *   in-session run record; may be sync or async.
 * @returns {Promise<{results: object, resultsPath: string, reportPath: string}>}
 * @throws {Error} When `fixtureDir` does not exist.
 */
async function runBenchmark(opts) {
  const {
    runs,
    fixtureDir,
    mockClaude,
    projectDir,
    reportPath,
    resultsPath,
    keepTmp = false,
    logger = console,
    orchestratorImpl = runOrchestratorSide,
    insessionImpl = runInsessionSide,
  } = opts;

  if (!fs.existsSync(fixtureDir)) {
    throw new Error(`Fixture dir not found: ${fixtureDir}`);
  }

  // Each call gets its own options object, as in the original call sites.
  const runOptsFor = (runIdx) => ({ fixtureDir, runIdx, mockClaude, logger, keepTmp });

  const orchestrator = [];
  const insession = [];
  for (let i = 1; i <= runs; i++) {
    // Awaited one at a time on purpose: runs must not overlap, and awaiting
    // lets injected async implementations work (a no-op for sync ones).
    orchestrator.push(await orchestratorImpl(runOptsFor(i)));
    insession.push(await insessionImpl(runOptsFor(i)));
  }

  const summary = computeVerdict(orchestrator, insession);
  const results = {
    schemaVersion: 1,
    generatedAt: new Date().toISOString(),
    runs,
    fixtureDir,
    env: collectEnv(),
    orchestrator,
    insession,
    summary,
    verdict: summary.verdict,
    verdictDetail: summary.verdictDetail,
  };

  const rp = resultsPath || path.join(projectDir, '.gsd-t', 'benchmark-results.json');
  const rrp = reportPath || path.join(projectDir, 'docs', 'm40-benchmark-report.md');
  fs.mkdirSync(path.dirname(rp), { recursive: true });
  fs.mkdirSync(path.dirname(rrp), { recursive: true });
  fs.writeFileSync(rp, JSON.stringify(results, null, 2));
  fs.writeFileSync(rrp, renderReportMd(results));

  const timings = `orchestrator ${summary.medianOrchMs}ms vs in-session ${summary.medianInsessionMs}ms`;
  let verdictLine;
  if (results.verdict === 'PASS') {
    verdictLine = `BENCHMARK: PASS — ${timings} — Waves 2+3+4 unlocked`;
  } else if (results.verdict === 'INVALID') {
    // INVALID means the timing comparison itself is not trustworthy, so it
    // must not be reported as a timing FAIL.
    verdictLine = `BENCHMARK: INVALID — ${summary.verdictDetail}`;
  } else {
    verdictLine = `BENCHMARK: FAIL — ${timings} — M40 HALT RECOMMENDED`;
  }
  logger.log('');
  logger.log(verdictLine);
  logger.log(`  Results: ${rp}`);
  logger.log(`  Report:  ${rrp}`);

  return { results, resultsPath: rp, reportPath: rrp };
}
402
+
403
/**
 * CLI driver: parse argv, run the benchmark, and translate the outcome into
 * a process exit code.
 *
 * Exit codes:
 *   0 — `--help` was requested, or the benchmark verdict is 'PASS'
 *   1 — the benchmark completed with any other verdict
 *   2 — argument parsing failed, or the benchmark threw
 */
async function main() {
  let cliArgs;
  try {
    cliArgs = parseCliArgs(process.argv.slice(2));
  } catch (parseErr) {
    process.stderr.write(`Error: ${parseErr.message}\n\n`);
    printHelp();
    process.exit(2);
  }

  if (cliArgs.help) {
    printHelp();
    process.exit(0);
  }

  try {
    const outcome = await runBenchmark(cliArgs);
    const passed = outcome.results.verdict === 'PASS';
    process.exit(passed ? 0 : 1);
  } catch (runErr) {
    process.stderr.write(`benchmark-orchestrator failed: ${runErr && runErr.message}\n`);
    if (runErr && runErr.stack) {
      process.stderr.write(runErr.stack + '\n');
    }
    process.exit(2);
  }
}
418
+
419
+ if (require.main === module) {
420
+ main();
421
+ }
422
+
423
+ module.exports = {
424
+ parseCliArgs,
425
+ median,
426
+ copyDirSync,
427
+ gitInitRepo,
428
+ prepareRun,
429
+ cleanupRun,
430
+ collectEnv,
431
+ renderReportMd,
432
+ computeVerdict,
433
+ runOrchestratorSide,
434
+ runInsessionSide,
435
+ runBenchmark,
436
+ PASS_TOLERANCE,
437
+ };