pan-wizard 3.5.2 → 3.7.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/README.md +8 -8
  2. package/agents/pan-executor.md +18 -0
  3. package/agents/pan-experiment-runner.md +126 -0
  4. package/agents/pan-phase-researcher.md +16 -0
  5. package/agents/pan-plan-checker.md +80 -0
  6. package/agents/pan-planner.md +19 -0
  7. package/agents/pan-reviewer.md +2 -0
  8. package/agents/pan-verifier.md +41 -0
  9. package/bin/install-lib.cjs +55 -0
  10. package/bin/install.js +71 -22
  11. package/commands/pan/debug.md +1 -1
  12. package/commands/pan/experiment.md +219 -0
  13. package/commands/pan/health.md +1 -1
  14. package/commands/pan/learn.md +15 -1
  15. package/commands/pan/optimize.md +13 -0
  16. package/commands/pan/patches.md +10 -1
  17. package/commands/pan/phase-tests.md +1 -4
  18. package/commands/pan/todo-add.md +1 -1
  19. package/commands/pan/todo-check.md +1 -1
  20. package/hooks/dist/pan-cost-logger.js +54 -4
  21. package/hooks/dist/pan-trace-logger.js +72 -3
  22. package/package.json +67 -66
  23. package/pan-wizard-core/bin/lib/commands.cjs +8 -0
  24. package/pan-wizard-core/bin/lib/config.cjs +13 -2
  25. package/pan-wizard-core/bin/lib/context-budget.cjs +73 -0
  26. package/pan-wizard-core/bin/lib/core.cjs +13 -0
  27. package/pan-wizard-core/bin/lib/doc-lint/frontmatter.js +270 -0
  28. package/pan-wizard-core/bin/lib/doc-lint/reporter.js +45 -0
  29. package/pan-wizard-core/bin/lib/doc-lint/schema.js +202 -0
  30. package/pan-wizard-core/bin/lib/doc-lint/validate.js +190 -0
  31. package/pan-wizard-core/bin/lib/doc-lint/walk.js +135 -0
  32. package/pan-wizard-core/bin/lib/doc-lint.cjs +287 -0
  33. package/pan-wizard-core/bin/lib/experiment.cjs +501 -0
  34. package/pan-wizard-core/bin/lib/learn-index.cjs +235 -0
  35. package/pan-wizard-core/bin/lib/learn-lint.cjs +292 -0
  36. package/pan-wizard-core/bin/lib/optimize.cjs +474 -1
  37. package/pan-wizard-core/bin/lib/runner.cjs +472 -0
  38. package/pan-wizard-core/bin/pan-tools.cjs +222 -2
  39. package/pan-wizard-core/learnings/README.md +70 -0
  40. package/pan-wizard-core/learnings/index.json +540 -0
  41. package/pan-wizard-core/learnings/internal/.gitkeep +2 -0
  42. package/pan-wizard-core/learnings/internal/experiment-runner.md +81 -0
  43. package/pan-wizard-core/learnings/internal/external-research.md +93 -0
  44. package/pan-wizard-core/learnings/internal/loop-design.md +33 -0
  45. package/pan-wizard-core/learnings/internal/pan-dev-bugs.md +181 -0
  46. package/pan-wizard-core/learnings/universal/.gitkeep +2 -0
  47. package/pan-wizard-core/learnings/universal/atomic-state.md +21 -0
  48. package/pan-wizard-core/learnings/universal/binary-io.md +21 -0
  49. package/pan-wizard-core/learnings/universal/comment-syntax.md +21 -0
  50. package/pan-wizard-core/learnings/universal/composition.md +33 -0
  51. package/pan-wizard-core/learnings/universal/concurrency.md +33 -0
  52. package/pan-wizard-core/learnings/universal/dag-scheduler.md +33 -0
  53. package/pan-wizard-core/learnings/universal/data-driven-design.md +21 -0
  54. package/pan-wizard-core/learnings/universal/design-process.md +21 -0
  55. package/pan-wizard-core/learnings/universal/empirical-spike.md +21 -0
  56. package/pan-wizard-core/learnings/universal/error-handling.md +23 -0
  57. package/pan-wizard-core/learnings/universal/error-paths.md +21 -0
  58. package/pan-wizard-core/learnings/universal/glob-semantics.md +21 -0
  59. package/pan-wizard-core/learnings/universal/idempotency.md +21 -0
  60. package/pan-wizard-core/learnings/universal/invariants.md +21 -0
  61. package/pan-wizard-core/learnings/universal/io-patterns.md +21 -0
  62. package/pan-wizard-core/learnings/universal/numeric-edge-cases.md +21 -0
  63. package/pan-wizard-core/learnings/universal/output-conventions.md +21 -0
  64. package/pan-wizard-core/learnings/universal/parser-design.md +21 -0
  65. package/pan-wizard-core/learnings/universal/phase-locking.md +21 -0
  66. package/pan-wizard-core/learnings/universal/pipe-friendly-cli.md +21 -0
  67. package/pan-wizard-core/learnings/universal/schema-design.md +21 -0
  68. package/pan-wizard-core/learnings/universal/secret-handling.md +21 -0
  69. package/pan-wizard-core/learnings/universal/streaming-io.md +21 -0
  70. package/pan-wizard-core/learnings/universal/test-patterns.md +57 -0
  71. package/pan-wizard-core/learnings/universal/test-strategy.md +33 -0
  72. package/pan-wizard-core/learnings/universal/unicode.md +21 -0
  73. package/pan-wizard-core/learnings/universal/vendor-pattern.md +21 -0
  74. package/pan-wizard-core/references/guardrails.md +58 -0
  75. package/pan-wizard-core/references/handoff-decisions.md +156 -0
  76. package/pan-wizard-core/references/schemas/pan-command.schema.yml +39 -0
  77. package/pan-wizard-core/references/verification-patterns.md +31 -0
  78. package/pan-wizard-core/templates/config.json +2 -1
  79. package/pan-wizard-core/templates/idea.md +52 -0
  80. package/pan-wizard-core/templates/summary-complex.md +14 -5
  81. package/pan-wizard-core/templates/summary-minimal.md +6 -0
  82. package/pan-wizard-core/templates/summary-standard.md +14 -3
  83. package/pan-wizard-core/workflows/discuss-phase.md +108 -1
  84. package/pan-wizard-core/workflows/exec-phase.md +37 -1
  85. package/pan-wizard-core/workflows/execute-plan.md +14 -0
  86. package/pan-wizard-core/workflows/health.md +23 -0
  87. package/pan-wizard-core/workflows/new-project.md +65 -81
  88. package/pan-wizard-core/workflows/plan-phase.md +58 -0
  89. package/pan-wizard-core/workflows/transition.md +102 -7
  90. package/pan-wizard-core/workflows/verify-phase.md +14 -0
  91. package/scripts/build-hooks.js +7 -1
  92. package/scripts/generate-skills-docs.py +10 -8
  93. package/scripts/release-check.js +184 -0
@@ -0,0 +1,472 @@
1
+ 'use strict';
2
+ /**
3
+ * runner.cjs — Self-improvement loop W2: external agent runner.
4
+ *
5
+ * Spec: docs/specs/self_improvement_loop_featureai.md §3.2
6
+ *
7
+ * Spawns an external AI coding session (Claude/Codex/Gemini/OpenCode) against
8
+ * an experiment folder, observes progress via run-state.json, enforces timeout
9
+ * + circuit breaker. The external instance runs autonomously; this runner
10
+ * observes only — it does NOT inject prompts mid-flight.
11
+ *
12
+ * Exports:
13
+ * - runExperiment(slug, opts) — spawn + observe + return result
14
+ * - tailExperimentState(slug, opts) — read run-state.json snapshot
15
+ * - stopExperiment(slug, opts) — graceful halt of a running experiment
16
+ * - RUNTIME_RUNNERS — adapter map (per-runtime headless invocation)
17
+ */
18
+
19
+ const fs = require('fs');
20
+ const path = require('path');
21
+ const { spawnSync } = require('child_process');
22
+ const { getExperimentManifest, PAN_EXPERIMENTS_ROOT_DEFAULT } = require('./experiment.cjs');
23
+
24
+ // ── Runtime adapter map ─────────────────────────────────────────────────────
25
+
26
+ /**
27
+ * Each adapter knows how to invoke its runtime headlessly with a prompt.
28
+ * `bin` is the binary name (PATH lookup at spawn time).
29
+ * `buildArgs(prompt)` returns argv to pass after the bin.
30
+ * `shell: 'win32'` opts the adapter into shell-based spawn ON WINDOWS ONLY —
31
+ * needed for CLI tools that ship as .cmd shims (npx/npm-installed binaries
32
+ * like claude/codex/gemini/opencode) which Node's spawnSync cannot resolve
33
+ * without a shell.
34
+ *
35
+ * Runtime overrides (--runtime-override / opts.runtimeOverride) do NOT inherit
36
+ * shell: 'win32' — they default to direct spawn, which suits test mocks like
37
+ * `node -e '...'` that are resolvable directly. P-102 fix (v3.7.1).
38
+ *
39
+ * GitHub Copilot CLI has no documented headless prompt mode, so it's null.
40
+ */
41
+ // P-1302 fix (v3.7.2): autonomous claude/gemini runs default to non-interactive
42
+ // permissions. Without these flags, the CLI prompts for tool approval, which
43
+ // can't be answered in headless mode and exits 1 silently. Surfaced by the
44
+ // first real autonomous loop run (panloop experiment). The runner's purpose IS
45
+ // autonomous execution — defaulting to interactive permission prompts contradicts
46
+ // the design.
47
+ //
48
+ // Safety: the flags trust the prompt's tool choices. Acceptable because the
49
+ // runner only spawns inside isolated experiment folders (PAN_SOURCE_ROOT-guarded
50
+ // by experiment.cjs) — blast radius is bounded to the experiment dir.
51
+ // P-1603 (v3.7.5): when `opts.captureMetrics` is true the runner switches
52
+ // claude into `--output-format json` so the trailing usage envelope can be
53
+ // parsed for cost/token metrics. Other runtimes are unchanged — token
54
+ // metering for codex/gemini/opencode is deferred (no equivalent flag).
55
// Per-runtime headless invocation recipes. Each entry names the binary, a
// builder producing the argv that follows it, and `shell: 'win32'` to request
// a shell-based spawn on Windows only. A null entry marks a runtime that is
// known but cannot be driven headlessly.
const RUNTIME_RUNNERS = Object.freeze({
  claude: {
    bin: 'claude',
    // Print mode with permission prompts disabled; the JSON output format is
    // requested only when the caller wants usage metrics.
    buildArgs(prompt, opts) {
      const wantsMetrics = opts && opts.captureMetrics;
      const metricFlags = wantsMetrics ? ['--output-format', 'json'] : [];
      return ['-p', '--dangerously-skip-permissions', ...metricFlags, prompt];
    },
    shell: 'win32',
  },
  codex: {
    bin: 'codex',
    buildArgs(prompt) {
      return ['exec', prompt];
    },
    shell: 'win32',
  },
  gemini: {
    bin: 'gemini',
    buildArgs(prompt) {
      return ['-p', '--yolo', prompt];
    },
    shell: 'win32',
  },
  opencode: {
    bin: 'opencode',
    buildArgs(prompt) {
      return [prompt];
    },
    shell: 'win32',
  },
  copilot: null,
});
71
+
72
+ // ── Stop reasons (enum-ish) ─────────────────────────────────────────────────
73
+
74
// Closed set of values written to run-state.json's `stop_reason`. Each key is
// the upper-cased form of its value.
const STOP_REASONS = Object.freeze(
  Object.fromEntries(
    [
      'success',
      'error',
      'timeout',
      'circuit_breaker',
      'manual',
      'incomplete', // P-1502 (v3.7.4): exit 0 but workflow didn't reach milestone-completion
    ].map((reason) => [reason.toUpperCase(), reason]),
  ),
);
82
+
83
// Hard timeout applied when the caller supplies none: one hour.
// P-EXP-004 (2026-05-02): raised from 30 min because real 3-plan phases were
// being cut off mid-execution (whoolog Phase 1 first run hit this).
const DEFAULT_TIMEOUT_MS = 1000 * 60 * 60;
86
+
87
+ // ── Helpers ─────────────────────────────────────────────────────────────────
88
+
89
/**
 * Locate the run-state file of an experiment.
 *
 * @param {string} experimentPath - experiment folder
 * @returns {string} `<experimentPath>/.planning/run-state.json`
 */
function getRunStatePath(experimentPath) {
  const segments = [experimentPath, '.planning', 'run-state.json'];
  return path.join(...segments);
}
92
+
93
/**
 * P-1502 helper: read `.planning/state.md` and extract the milestone status.
 *
 * The first line of the form `status: <value>` wins; the value is the first
 * whitespace-delimited token after the colon ON THE SAME LINE. A `status:`
 * line with no value is treated as "no status" rather than borrowing a token
 * from the following line.
 *
 * @param {string} experimentPath - experiment folder
 * @returns {string|null} the status token, or null when state.md is missing,
 *   unreadable, or has no populated `status:` line
 */
function readMilestoneStatus(experimentPath) {
  const statePath = path.join(experimentPath, '.planning', 'state.md');
  let text;
  try {
    text = fs.readFileSync(statePath, 'utf-8');
  } catch {
    // Missing or unreadable state.md simply means "status unknown".
    return null;
  }
  // `[ \t]*` (not `\s*`) keeps the match on one line: `\s` would also consume
  // the newline and capture the next line's first token for an empty status.
  const match = text.match(/^status:[ \t]*(\S+)/m);
  return match ? match[1] : null;
}
105
+
106
/**
 * P-1603 (v3.7.5): pull the trailing JSON envelope out of captured stdout.
 *
 * The envelope is expected to be the last thing in the output: a JSON object
 * that starts at the beginning of a line and carries `total_cost_usd` and/or
 * `usage`. Candidate start lines are tried from the bottom up; each candidate
 * is parsed together with everything that follows it.
 *
 * @param {string} stdout - captured standard output
 * @returns {object|null} the parsed envelope, or null when stdout is empty,
 *   not a string, does not end in `}`, or holds no parseable envelope
 */
function parseClaudeJsonEnvelope(stdout) {
  if (typeof stdout !== 'string' || stdout === '') return null;

  const text = stdout.trimEnd();
  if (text.slice(-1) !== '}') return null;

  const rows = text.split(/\r?\n/);
  let start = rows.length;
  while (start-- > 0) {
    // Only a line that opens with a brace can begin the envelope.
    if (rows[start].charAt(0) !== '{') continue;

    let candidate;
    try {
      candidate = JSON.parse(rows.slice(start).join('\n'));
    } catch {
      // Not valid JSON from here to the end; try an earlier line.
      continue;
    }

    const isObject = Boolean(candidate) && typeof candidate === 'object';
    if (isObject && (candidate.total_cost_usd != null || candidate.usage)) {
      return candidate;
    }
  }
  return null;
}
133
+
134
/**
 * Persist a run-state snapshot as pretty-printed JSON (2-space indent).
 *
 * Persistence is best-effort: serialization and write failures are swallowed
 * on purpose so that the runner never fails because state could not be saved.
 *
 * @param {string} experimentPath - experiment folder
 * @param {object} state - run-state object to serialize
 * @returns {void}
 */
function writeRunState(experimentPath, state) {
  const target = getRunStatePath(experimentPath);
  try {
    const serialized = JSON.stringify(state, null, 2);
    fs.writeFileSync(target, serialized);
  } catch {
    // Deliberately ignored: state persistence must not break a run.
  }
}
142
+
143
/**
 * Load and parse an experiment's run-state file.
 *
 * @param {string} experimentPath - experiment folder
 * @returns {object|null} the parsed state, or null when the file is missing,
 *   unreadable, or not valid JSON (all failures are reported the same way)
 */
function readRunState(experimentPath) {
  const source = getRunStatePath(experimentPath);
  let raw;
  try {
    raw = fs.readFileSync(source, 'utf-8');
    return JSON.parse(raw);
  } catch {
    return null;
  }
}
152
+
153
/**
 * Append a timestamped event to `state.events`, creating the list on first use.
 *
 * @param {object} state - run-state object; mutated in place
 * @param {string} type - event type label
 * @param {*} [details] - free-form details; any falsy value is stored as null
 * @returns {void}
 */
function appendEvent(state, type, details) {
  const log = state.events || [];
  state.events = log;

  const entry = {
    ts: new Date().toISOString(),
    type,
    details: details ? details : null,
  };
  log.push(entry);
}
161
+
162
+ // ── runExperiment ───────────────────────────────────────────────────────────
163
+
164
/**
 * Spawn the external runtime for an experiment and block until it exits or is
 * aborted by the timeout, recording the outcome in `.planning/run-state.json`.
 *
 * The child is started with spawnSync, so its pid is only known once it has
 * exited: run-state.json carries `pid: null` for the whole duration of the run.
 *
 * @param {string} slug - experiment id
 * @param {object} opts
 * @param {string} [opts.root] - experiment root (default PAN_EXPERIMENTS_ROOT_DEFAULT)
 * @param {string} [opts.prompt] - prompt passed to the external runtime; default
 *   is `/pan:new-project --auto @.planning/idea.md`
 * @param {number} [opts.timeoutMs] - hard timeout (default DEFAULT_TIMEOUT_MS, 60 min)
 * @param {number} [opts.maxBufferBytes] - cap on captured stdout/stderr
 *   (default 64 MiB); the child is terminated if it is exceeded
 * @param {object} [opts.runtimeOverride] - { bin, buildArgs } to bypass the manifest's
 *   runtime adapter (used by tests); also skips the milestone check
 * @param {function} [opts.onProgress] - callback invoked once per stream
 *   (stdout, then stderr) with the full captured text after the child exits
 * @param {boolean} [opts.captureMetrics] - when true, claude is invoked with
 *   --output-format json so the trailing usage envelope can be parsed and
 *   stored under runState.metrics (P-1603, v3.7.5). Other runtimes ignore.
 * @returns {object} on completion `{ experiment_id, status, stop_reason,
 *   exit_code, elapsed_ms, started_at, ended_at }`; `{ error }` when the
 *   experiment or runtime cannot be resolved; `{ error, status, stop_reason,
 *   elapsed_ms }` when spawnSync itself throws
 */
function runExperiment(slug, opts = {}) {
  const root = opts.root || PAN_EXPERIMENTS_ROOT_DEFAULT;
  const manifest = getExperimentManifest(slug, { root });
  if (manifest.error) return { error: manifest.error };

  const expPath = path.join(root, slug);
  if (!fs.existsSync(expPath)) {
    return { error: `experiment folder missing: ${expPath}` };
  }

  // Adapter selection. Own-property lookup only, so a manifest runtime such as
  // "constructor" cannot resolve to an inherited Object.prototype member.
  let adapter = opts.runtimeOverride;
  if (!adapter) {
    const runtime = manifest.runtime;
    adapter = Object.prototype.hasOwnProperty.call(RUNTIME_RUNNERS, runtime)
      ? RUNTIME_RUNNERS[runtime]
      : null;
    if (adapter == null) {
      return {
        error: `runtime "${runtime}" is not supported by the experiment runner ` +
          `(known: ${Object.keys(RUNTIME_RUNNERS).filter(r => RUNTIME_RUNNERS[r]).join(', ')})`,
      };
    }
  }

  const prompt = opts.prompt || '/pan:new-project --auto @.planning/idea.md';
  const timeoutMs = opts.timeoutMs || DEFAULT_TIMEOUT_MS;
  // spawnSync's default maxBuffer is 1 MiB; a long session easily exceeds it,
  // which terminates the child and truncates the output. Use a generous cap.
  const maxBufferBytes = opts.maxBufferBytes ?? 64 * 1024 * 1024;
  const onProgress = typeof opts.onProgress === 'function' ? opts.onProgress : null;

  const startedAt = new Date().toISOString();
  const startTime = Date.now();

  // Initialize run-state.json
  const runState = {
    experiment_id: slug,
    status: 'running',
    started_at: startedAt,
    ended_at: null,
    pid: null,
    exit_code: null,
    stop_reason: null,
    elapsed_ms: null,
    events: [],
  };
  appendEvent(runState, 'started', `runtime=${manifest.runtime}, prompt=${prompt}`);
  writeRunState(expPath, runState);

  // Synchronous spawn with native timeout. Output is captured and handed to
  // onProgress after exit; real-time streaming would need an async runner.
  //
  // P-102 fix (v3.7.1): on Windows, CLI tools that ship as .cmd shims cannot
  // be spawned with shell:false. Adapters opt into shell-based spawn via
  // `shell: 'win32'`. Runtime overrides do NOT inherit it, so `node -e '...'`
  // mocks work without shell-based arg mangling.
  const useShell = adapter.shell === 'win32' && process.platform === 'win32';

  // P-1304 fix (v3.7.2): under shell:true Node joins args with spaces but does
  // NOT quote them, so cmd.exe re-splits multi-word args. Quote any arg that
  // contains whitespace, and also any arg containing cmd.exe metacharacters
  // (& | < > ^ ( )) which would otherwise be interpreted by the shell instead
  // of reaching the runtime. Embedded double-quotes are escaped by doubling.
  const captureMetrics = Boolean(opts.captureMetrics);
  const rawArgs = adapter.buildArgs(prompt, { captureMetrics });
  const needsCmdQuoting = /[\s&|<>^()]/;
  const spawnArgs = useShell
    ? rawArgs.map((arg) => {
      const text = String(arg);
      return needsCmdQuoting.test(text) ? `"${text.replace(/"/g, '""')}"` : arg;
    })
    : rawArgs;

  // P-1501-r2 fix (v3.7.4): inherit the parent's stdin so a runtime started
  // from a terminal sees a TTY and keeps its autonomous tool-use loop going.
  // Trade-off: the child reads from the same TTY as the parent.
  let result;
  try {
    result = spawnSync(adapter.bin, spawnArgs, {
      cwd: expPath,
      stdio: ['inherit', 'pipe', 'pipe'],
      shell: useShell,
      timeout: timeoutMs,
      maxBuffer: maxBufferBytes,
      encoding: 'utf-8',
    });
  } catch (err) {
    runState.status = 'failed';
    runState.stop_reason = STOP_REASONS.ERROR;
    runState.ended_at = new Date().toISOString();
    runState.elapsed_ms = Date.now() - startTime;
    appendEvent(runState, 'spawn_failed', err.message);
    writeRunState(expPath, runState);
    return {
      error: `failed to spawn ${adapter.bin}: ${err.message}`,
      status: 'failed',
      stop_reason: STOP_REASONS.ERROR,
      elapsed_ms: runState.elapsed_ms,
    };
  }

  runState.pid = result.pid || null;

  // Emit captured output if a progress handler is set
  if (onProgress) {
    if (result.stdout) onProgress({ stream: 'stdout', text: result.stdout });
    if (result.stderr) onProgress({ stream: 'stderr', text: result.stderr });
  }

  // P-1603 (v3.7.5): when captureMetrics was requested, parse the trailing
  // JSON envelope from stdout and persist cost/token metrics into the state.
  if (captureMetrics && result.stdout) {
    const envelope = parseClaudeJsonEnvelope(result.stdout);
    if (envelope) {
      runState.metrics = {
        total_cost_usd: envelope.total_cost_usd ?? null,
        num_turns: envelope.num_turns ?? null,
        session_id: envelope.session_id ?? null,
        input_tokens: envelope.usage?.input_tokens ?? null,
        output_tokens: envelope.usage?.output_tokens ?? null,
        cache_creation_input_tokens: envelope.usage?.cache_creation_input_tokens ?? null,
        cache_read_input_tokens: envelope.usage?.cache_read_input_tokens ?? null,
      };
      appendEvent(runState, 'metrics_captured', `cost=$${envelope.total_cost_usd ?? '?'}, turns=${envelope.num_turns ?? '?'}`);
    } else {
      appendEvent(runState, 'metrics_unavailable', 'no JSON envelope in stdout');
    }
  }

  const endedAt = new Date().toISOString();
  const elapsedMs = Date.now() - startTime;

  runState.ended_at = endedAt;
  runState.elapsed_ms = elapsedMs;
  runState.exit_code = result.status;

  // Timeout detection. spawnSync reports a fired timeout through
  // result.error (code ETIMEDOUT). A bare SIGTERM is NOT proof of a timeout:
  // the child may have been killed externally or for exceeding maxBuffer
  // (ENOBUFS). The elapsed-time heuristic is kept only as a fallback for the
  // case where the child died without an exit code and no error was reported.
  const errorCode = result.error ? result.error.code : undefined;
  const timedOut =
    errorCode === 'ETIMEDOUT' ||
    (!result.error && result.status === null && elapsedMs >= timeoutMs - 50);

  if (timedOut) {
    runState.status = 'failed';
    runState.stop_reason = STOP_REASONS.TIMEOUT;
    appendEvent(runState, 'timeout', `aborted after ${timeoutMs}ms`);
  } else if (result.error) {
    runState.status = 'failed';
    runState.stop_reason = STOP_REASONS.ERROR;
    appendEvent(runState, 'spawn_error', result.error.message);
  } else if (result.status === 0) {
    // P-1502 fix (v3.7.4): exit_code=0 alone is too coarse. Read state.md to
    // verify the workflow actually reached milestone-completion; otherwise
    // mark the run 'incomplete' so analysis can tell real success from a
    // premature clean exit.
    //
    // The check is skipped when runtimeOverride is set (tests/dev mocks that
    // do not write state.md).
    if (opts.runtimeOverride) {
      runState.status = 'done';
      runState.stop_reason = STOP_REASONS.SUCCESS;
      appendEvent(runState, 'completed', 'exit_code=0 (runtime override; milestone check skipped)');
    } else {
      const milestone = readMilestoneStatus(expPath);
      if (milestone === 'completed') {
        runState.status = 'done';
        runState.stop_reason = STOP_REASONS.SUCCESS;
        appendEvent(runState, 'completed', 'exit_code=0, milestone=completed');
      } else {
        runState.status = 'incomplete';
        runState.stop_reason = STOP_REASONS.INCOMPLETE;
        appendEvent(runState, 'incomplete', `exit_code=0 but milestone status=${milestone || 'unknown'}`);
      }
    }
  } else {
    // Non-zero exit, or termination by a signal before the deadline.
    runState.status = 'failed';
    runState.stop_reason = STOP_REASONS.ERROR;
    const signalNote = result.signal ? `, signal=${result.signal}` : '';
    appendEvent(runState, 'completed', `exit_code=${result.status}${signalNote}`);
  }

  writeRunState(expPath, runState);

  return {
    experiment_id: slug,
    status: runState.status,
    stop_reason: runState.stop_reason,
    exit_code: result.status,
    elapsed_ms: elapsedMs,
    started_at: startedAt,
    ended_at: endedAt,
  };
}
389
+
390
+ // ── tailExperimentState ─────────────────────────────────────────────────────
391
+
392
/**
 * Return the current run-state.json contents for an experiment.
 * This is a one-shot snapshot; nothing is streamed or polled.
 *
 * @param {string} slug - experiment id
 * @param {object} [opts]
 * @param {string} [opts.root] - experiment root (default PAN_EXPERIMENTS_ROOT_DEFAULT)
 * @returns {object} the parsed run state, or `{ error }` when the manifest
 *   lookup fails or no readable run state exists
 */
function tailExperimentState(slug, opts = {}) {
  const experimentsRoot = opts.root || PAN_EXPERIMENTS_ROOT_DEFAULT;

  const { error: manifestError } = getExperimentManifest(slug, { root: experimentsRoot });
  if (manifestError) return { error: manifestError };

  const snapshot = readRunState(path.join(experimentsRoot, slug));
  return snapshot || { error: `experiment "${slug}" has no run state (not started yet)` };
}
408
+
409
+ // ── stopExperiment ──────────────────────────────────────────────────────────
410
+
411
/**
 * Stop a running experiment.
 *
 * If run-state.json shows status=running with a recorded pid, SIGTERM is sent
 * to that pid and the state is rewritten as a manual stop. No SIGKILL
 * escalation is performed: the function returns right after signalling and
 * does not wait to see whether the process actually exits.
 *
 * If the experiment has already finished, its current state is returned (no
 * error).
 *
 * NOTE(review): runExperiment in this file records the pid only after its
 * synchronous spawn returns, so a state with status=running normally has
 * pid=null and this function answers "no recorded pid" — confirm whether
 * another writer populates pid during a run.
 *
 * @param {string} slug - experiment id
 * @param {object} [opts]
 * @param {string} [opts.root] - experiment root (default PAN_EXPERIMENTS_ROOT_DEFAULT)
 * @returns {object} the (possibly updated) run state, or `{ error }` when the
 *   manifest lookup fails, there is no run state, no pid is recorded, or the
 *   process exists but could not be signalled
 */
function stopExperiment(slug, opts = {}) {
  const root = opts.root || PAN_EXPERIMENTS_ROOT_DEFAULT;
  const manifest = getExperimentManifest(slug, { root });
  if (manifest.error) return { error: manifest.error };

  const expPath = path.join(root, slug);
  const state = readRunState(expPath);
  if (!state) {
    return { error: `experiment "${slug}" has no active run` };
  }

  if (state.status !== 'running') {
    // Already finished — return current state, not an error
    return state;
  }

  if (!state.pid) {
    return { error: `experiment "${slug}" has no recorded pid` };
  }

  // Record the manual stop; keeps ended_at and elapsed_ms consistent.
  const markStopped = (eventType, details) => {
    const now = Date.now();
    state.status = 'failed';
    state.stop_reason = STOP_REASONS.MANUAL;
    state.ended_at = new Date(now).toISOString();
    const startedMs = Date.parse(state.started_at);
    if (!Number.isNaN(startedMs)) state.elapsed_ms = now - startedMs;
    appendEvent(state, eventType, details);
    writeRunState(expPath, state);
    return state;
  };

  try {
    process.kill(state.pid, 'SIGTERM');
  } catch (err) {
    // ESRCH: no such process — it is already gone, so close out the record.
    if (err && err.code === 'ESRCH') {
      return markStopped('stop_no_pid', `pid ${state.pid} already gone`);
    }
    // Anything else (e.g. EPERM, invalid pid value) means the signal was not
    // delivered; the process may still be running, so do not claim it stopped.
    const reason = err && err.message ? err.message : String(err);
    return { error: `failed to signal pid ${state.pid} for experiment "${slug}": ${reason}` };
  }

  return markStopped('stopped', 'SIGTERM sent');
}
462
+
463
+ // ── Exports ─────────────────────────────────────────────────────────────────
464
+
465
// Public surface of the runner: the three entry points plus the adapter map
// and the constants callers may want to reference.
const runnerApi = {
  runExperiment,
  tailExperimentState,
  stopExperiment,
  RUNTIME_RUNNERS,
  STOP_REASONS,
  DEFAULT_TIMEOUT_MS,
};

module.exports = runnerApi;