claude-code-session-manager 0.21.2 → 0.21.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/bin/cli.cjs +5 -0
  2. package/dist/assets/{TiptapBody-CepFtp62.js → TiptapBody-CZLSQ6pj.js} +2 -2
  3. package/dist/assets/cssMode-DfqZGMQs.js +1 -0
  4. package/dist/assets/{freemarker2-DqQlU_4i.js → freemarker2-XTPYh37h.js} +1 -1
  5. package/dist/assets/handlebars-DKUF5VyH.js +1 -0
  6. package/dist/assets/html-uqoqsIeI.js +1 -0
  7. package/dist/assets/htmlMode-aMTQs1su.js +1 -0
  8. package/dist/assets/index-BUrrcj7x.js +3525 -0
  9. package/dist/assets/index-DeQI4oVI.css +32 -0
  10. package/dist/assets/javascript-BVxRZMds.js +1 -0
  11. package/dist/assets/{jsonMode-CFEryxme.js → jsonMode-D04xP2s5.js} +4 -4
  12. package/dist/assets/liquid-BkQHTH2P.js +1 -0
  13. package/dist/assets/lspLanguageFeatures-By9uLznH.js +4 -0
  14. package/dist/assets/mdx-Du1IlbjV.js +1 -0
  15. package/dist/assets/{index-CrE67_1W.css → monaco-editor-BTnBOi8r.css} +1 -32
  16. package/dist/assets/monaco-editor-BW5C4Iv1.js +908 -0
  17. package/dist/assets/python-DSlImqXd.js +1 -0
  18. package/dist/assets/razor-BmUVyvSK.js +1 -0
  19. package/dist/assets/{tsMode-CNLm8WAZ.js → tsMode-Btj0TTH7.js} +1 -1
  20. package/dist/assets/typescript-Bzelq9vO.js +1 -0
  21. package/dist/assets/xml-Whd9EaSd.js +1 -0
  22. package/dist/assets/yaml-QYf0-IN8.js +1 -0
  23. package/dist/index.html +4 -2
  24. package/package.json +1 -1
  25. package/src/main/__tests__/runVerify.test.cjs +138 -0
  26. package/src/main/config.cjs +36 -4
  27. package/src/main/historyAggregator.cjs +400 -149
  28. package/src/main/index.cjs +8 -0
  29. package/src/main/ipcSchemas.cjs +42 -13
  30. package/src/main/kg.cjs +87 -30
  31. package/src/main/lib/credentials.cjs +7 -0
  32. package/src/main/lib/e2eStateMachine.cjs +39 -0
  33. package/src/main/runVerify.cjs +51 -5
  34. package/src/main/scheduler/prdParser.cjs +16 -1
  35. package/src/main/scheduler.cjs +171 -13
  36. package/src/main/transcripts.cjs +141 -19
  37. package/src/main/usageMatrix.cjs +7 -3
  38. package/src/main/webRemote.cjs +196 -31
  39. package/src/preload/api.d.ts +40 -0
  40. package/src/preload/index.cjs +7 -0
  41. package/dist/assets/cssMode-8hR_Zezu.js +0 -1
  42. package/dist/assets/handlebars-Ts2NzFcS.js +0 -1
  43. package/dist/assets/html-QjLxt2p6.js +0 -1
  44. package/dist/assets/htmlMode-Dst38sy3.js +0 -1
  45. package/dist/assets/index-XKsJ4Pk3.js +0 -4431
  46. package/dist/assets/javascript-CNxLjNGz.js +0 -1
  47. package/dist/assets/liquid-BBfKLTB_.js +0 -1
  48. package/dist/assets/lspLanguageFeatures-BNyh7ouG.js +0 -4
  49. package/dist/assets/mdx-SaTyS1xC.js +0 -1
  50. package/dist/assets/python-C84TNhMd.js +0 -1
  51. package/dist/assets/razor-BaVJM3L8.js +0 -1
  52. package/dist/assets/typescript-BdrDpzPy.js +0 -1
  53. package/dist/assets/xml-CHJ3Xjjj.js +0 -1
  54. package/dist/assets/yaml-Cg2-K8t3.js +0 -1
@@ -394,35 +394,64 @@ function validated(schema, handler) {
394
394
  }
395
395
 
396
396
  // ──────────────────────────────────────────── Web Remote command allowlist
397
- // Single source of truth imported by webRemote.cjs and by the unit test.
398
- // Only these type strings will ever reach a handler; all others are silently
399
- // dropped without leaking error details back to the relay (ADR §6.2).
400
- const ALLOWED_COMMANDS = new Set([
397
+ // Commands are split into three tiers:
398
+ // READ_COMMANDS — return no sensitive user data; allowed when remoteEnabled=true.
399
+ // SAS_GATED_READS — return sensitive user data (sessions, PRDs, logs,
400
+ // transcript summaries); additionally require
401
+ // _e2eAuthenticated=true (SAS confirmed by user).
402
+ // A compromised relay cannot exfiltrate this data from
403
+ // a session that has not been SAS-confirmed.
404
+ // MUTATE_COMMANDS — write files, spawn processes, or mutate persisted
405
+ // state; gated behind remoteControlEnabled=true AND
406
+ // _e2eAuthenticated=true.
407
+ // ALLOWED_COMMANDS is the union, kept for existing import compatibility.
408
+ //
409
+ // Ungated READ_COMMANDS (justify each):
410
+ // cmd:app:version — exposes only the app semver string; no user data.
411
+ // cmd:session:unsubscribe — teardown lifecycle; returns nothing sensitive.
412
+ const READ_COMMANDS = new Set([
413
+ 'cmd:app:version',
414
+ // v2 mobile: unsubscribe is a teardown lifecycle call with no data payload.
415
+ 'cmd:session:unsubscribe',
416
+ ]);
417
+
418
+ // Sensitive reads — return user data; require SAS confirmation same as MUTATE.
419
+ const SAS_GATED_READS = new Set([
401
420
  'cmd:sessions:load',
421
+ 'cmd:schedule:state',
422
+ 'cmd:schedule:read-prd',
423
+ 'cmd:schedule:read-log',
424
+ 'cmd:history:aggregate',
425
+ // subscribe initiates a live stream of session state/summary — sensitive.
426
+ 'cmd:session:subscribe',
427
+ ]);
428
+
429
+ const MUTATE_COMMANDS = new Set([
402
430
  'cmd:sessions:save',
403
431
  'cmd:pty:spawn',
404
432
  'cmd:pty:write',
405
- 'cmd:pty:resize',
433
+ // pty:kill terminates a live session; pty:resize drives the geometry of the
434
+ // user's interactive PTY — both write live process state, so they are gated
435
+ // behind remoteControlEnabled + SAS like every other mutation. A read-only
436
+ // mobile mirror has no business killing or resizing the desktop's session.
406
437
  'cmd:pty:kill',
407
- 'cmd:schedule:state',
408
- 'cmd:schedule:read-prd',
409
- 'cmd:schedule:read-log',
438
+ 'cmd:pty:resize',
410
439
  'cmd:schedule:write-prd',
411
440
  'cmd:schedule:reset-job',
412
441
  'cmd:schedule:run-now',
413
442
  'cmd:schedule:set-config',
414
- 'cmd:history:aggregate',
415
- 'cmd:app:version',
416
- // v2 mobile: per-session live state + summary push (ARCHITECTURE-V2-MOBILE.md §3)
417
- 'cmd:session:subscribe',
418
- 'cmd:session:unsubscribe',
419
443
  ]);
420
444
 
445
+ const ALLOWED_COMMANDS = new Set([...READ_COMMANDS, ...SAS_GATED_READS, ...MUTATE_COMMANDS]);
446
+
421
447
  module.exports = {
422
448
  // Centralized slug regex — used by scheduler.cjs and queueOps.cjs for
423
449
  // direct test()/match() containment checks alongside the zod parses.
424
450
  SCHEDULE_SLUG_RE,
425
451
  SCHEDULE_RUN_ID_RE,
452
+ READ_COMMANDS,
453
+ SAS_GATED_READS,
454
+ MUTATE_COMMANDS,
426
455
  ALLOWED_COMMANDS,
427
456
  schemas: {
428
457
  webRemotePair,
package/src/main/kg.cjs CHANGED
@@ -39,16 +39,24 @@ const path = require('node:path');
39
39
  const os = require('node:os');
40
40
  const { resolveClaudeBin } = require('./lib/claudeBin.cjs');
41
41
  const { encodeCwd } = require('./lib/encodeCwd.cjs');
42
+ const { writeJson } = require('./config.cjs');
42
43
 
43
44
  const HOME = os.homedir();
44
45
  const KG_DIR = path.join(HOME, '.claude', 'knowledge-log');
45
46
  const LOG_PATH = path.join(KG_DIR, 'prompts.jsonl');
46
47
  const GRAPHS_DIR = path.join(KG_DIR, 'graphs');
47
48
  const INGEST_STATE_PATH = path.join(KG_DIR, 'ingest-state.json');
49
+ const PROMPT_INDEX_PATH = path.join(KG_DIR, 'prompt-index.json');
48
50
  const BATCH = 20; // prompts per extraction call (also a per-project cap)
49
51
  const KNOWN_VOCAB = 200; // top node names pre-seeded for dedup-at-extraction
50
52
  const MAX_TAIL_BYTES = 8 * 1024 * 1024; // bound bytes scanned per ingest run
51
53
  const MAX_EXTRACTIONS_PER_RUN = 30; // bound claude calls per run (cost/time)
54
+ // Coalescing window before an auto-ingest after new prompts land. Units never
55
+ // mix projects, and a project switch in the log closes the current batch — so
56
+ // with concurrent sessions a short window yields 1-2-prompt batches and one
57
+ // claude spawn each (~1.2K extraction runs in one 48h period). A long window
58
+ // lets prompts accumulate into fuller batches; the KG tab tolerates the lag.
59
+ const WATCH_COALESCE_MS = 5 * 60_000;
52
60
 
53
61
  const ENTITY_TYPES = ['project', 'feature', 'tool', 'tech', 'concept', 'goal', 'person'];
54
62
 
@@ -137,11 +145,7 @@ async function loadGraphFor(cwd) {
137
145
  }
138
146
 
139
147
  async function saveGraph(g) {
140
- await fsp.mkdir(GRAPHS_DIR, { recursive: true });
141
- const p = graphPath(g.cwd);
142
- const tmp = `${p}.tmp`;
143
- await fsp.writeFile(tmp, JSON.stringify(g, null, 2));
144
- await fsp.rename(tmp, p); // atomic
148
+ await writeJson(graphPath(g.cwd), g);
145
149
  }
146
150
 
147
151
  async function loadIngestState() {
@@ -152,10 +156,20 @@ async function loadIngestState() {
152
156
  }
153
157
 
154
158
  async function saveIngestState(s) {
155
- await fsp.mkdir(KG_DIR, { recursive: true });
156
- const tmp = `${INGEST_STATE_PATH}.tmp`;
157
- await fsp.writeFile(tmp, JSON.stringify(s, null, 2));
158
- await fsp.rename(tmp, INGEST_STATE_PATH);
159
+ await writeJson(INGEST_STATE_PATH, s);
160
+ }
161
+
162
/**
 * Load the per-project prompt-count sidecar.
 *
 * Expected shape: { [encodedCwd]: { count: number, cwd: string } }
 *
 * @returns {Promise<Object<string, {count: number, cwd: string}>|null>}
 *   The parsed index, or null when no usable index exists. listProjects() and
 *   getState() react to null by rebuilding the sidecar from the full log.
 */
async function readPromptIndex() {
  let parsed;
  try {
    parsed = JSON.parse(await fsp.readFile(PROMPT_INDEX_PATH, 'utf8'));
  } catch {
    // Missing, unreadable, or syntactically corrupt file: deliberately treated
    // the same as "no index yet" so the caller can fall back to a rebuild.
    return null;
  }
  // Valid JSON is not necessarily a valid index: an array, string, number or
  // literal `null` would break callers that read and assign `idx[enc].count`.
  // Report those as "no index" too instead of handing back a wrong-shaped value.
  const isPlainObject = parsed !== null
    && typeof parsed === 'object'
    && !Array.isArray(parsed);
  return isPlainObject ? parsed : null;
}
170
+
171
+ async function savePromptIndex(idx) {
172
+ await writeJson(PROMPT_INDEX_PATH, idx);
159
173
  }
160
174
 
161
175
  /** Canonical dedup key: lowercase, strip leading article, collapse whitespace. */
@@ -337,6 +351,7 @@ async function ingest() {
337
351
  broadcast('kg:ingest-progress', { phase: 'start', ingesting: true });
338
352
  try {
339
353
  const st = await loadIngestState();
354
+ const promptIdx = await readPromptIndex() ?? {};
340
355
  let stat;
341
356
  try { stat = await fsp.stat(LOG_PATH); }
342
357
  catch { broadcast('kg:ingest-progress', { phase: 'done', ingesting: false, added: 0 }); return { ok: true, added: 0, note: 'no log yet' }; }
@@ -423,6 +438,14 @@ async function ingest() {
423
438
  st.lastOffset += u.bytes;
424
439
  st.lastTs = u.entries[u.entries.length - 1].ts || st.lastTs;
425
440
  st.updatedAt = new Date().toISOString();
441
+ // Write index before advancing watermark: if we crash between these two
442
+ // writes, the watermark hasn't moved so the batch will be re-processed
443
+ // (the index count may be slightly high) rather than advanced past a
444
+ // batch whose index entry was never written.
445
+ if (!promptIdx[u.enc]) promptIdx[u.enc] = { count: 0, cwd: u.cwd };
446
+ promptIdx[u.enc].count += u.entries.length;
447
+ promptIdx[u.enc].cwd = u.cwd;
448
+ await savePromptIndex(promptIdx);
426
449
  await saveIngestState(st);
427
450
  if (extractions >= MAX_EXTRACTIONS_PER_RUN) { capped = true; break; }
428
451
  continue;
@@ -435,12 +458,19 @@ async function ingest() {
435
458
  g.updatedAt = new Date().toISOString();
436
459
 
437
460
  // Commit this batch: graph first (so a crash can't advance the watermark
438
- // past unsaved work), then the watermark.
461
+ // past unsaved work), then the watermark + sidecar index.
439
462
  await saveGraph(g);
440
463
  st.lastOffset += u.bytes;
441
464
  st.promptCount += u.entries.length;
442
465
  st.lastTs = batchTs;
443
466
  st.updatedAt = new Date().toISOString();
467
+ // Write index before advancing watermark so a crash between the two
468
+ // leaves the watermark un-advanced (re-processable) rather than
469
+ // advancing past a batch whose index entry was never committed.
470
+ if (!promptIdx[u.enc]) promptIdx[u.enc] = { count: 0, cwd: u.cwd };
471
+ promptIdx[u.enc].count += u.entries.length;
472
+ promptIdx[u.enc].cwd = u.cwd;
473
+ await savePromptIndex(promptIdx);
444
474
  await saveIngestState(st);
445
475
 
446
476
  committedPrompts += u.entries.length;
@@ -473,25 +503,29 @@ async function ingest() {
473
503
 
474
504
  /** Enumerate projects seen in the log, enriched with per-project graph stats. */
475
505
  async function listProjects() {
476
- const prompts = await readAllPrompts();
477
- const byEnc = new Map();
478
- for (const p of prompts) {
479
- if (!p.cwd) continue;
480
- const enc = encodeCwd(p.cwd);
481
- let e = byEnc.get(enc);
482
- if (!e) { e = { cwd: p.cwd, enc, total: 0 }; byEnc.set(enc, e); }
483
- e.total++;
484
- e.cwd = p.cwd; // keep most recent spelling
506
+ let idx = await readPromptIndex();
507
+ if (idx === null) {
508
+ // One-time migration: build sidecar from the full log.
509
+ idx = {};
510
+ const prompts = await readAllPrompts();
511
+ for (const p of prompts) {
512
+ if (!p.cwd) continue;
513
+ const enc = encodeCwd(p.cwd);
514
+ if (!idx[enc]) idx[enc] = { count: 0, cwd: p.cwd };
515
+ idx[enc].count++;
516
+ idx[enc].cwd = p.cwd;
517
+ }
518
+ await savePromptIndex(idx).catch(() => {});
485
519
  }
486
520
  const out = [];
487
- for (const e of byEnc.values()) {
488
- const g = await loadGraphFor(e.cwd);
521
+ for (const [enc, entry] of Object.entries(idx)) {
522
+ const g = await loadGraphFor(entry.cwd);
489
523
  out.push({
490
- cwd: e.cwd,
491
- label: shortLabel(e.cwd),
492
- total: e.total,
524
+ cwd: entry.cwd,
525
+ label: shortLabel(entry.cwd),
526
+ total: entry.count,
493
527
  processed: g.promptCount || 0,
494
- pending: Math.max(0, e.total - (g.promptCount || 0)),
528
+ pending: Math.max(0, entry.count - (g.promptCount || 0)),
495
529
  nodes: g.nodes.length,
496
530
  edges: g.edges.length,
497
531
  lastIngest: g.updatedAt,
@@ -510,7 +544,24 @@ async function getState(cwd) {
510
544
  const target = cwd || await defaultCwd();
511
545
  const enc = encodeCwd(target);
512
546
  const g = await loadGraphFor(target);
513
- const prompts = (await readAllPrompts()).filter((p) => encodeCwd(p.cwd) === enc);
547
+ let idx = await readPromptIndex();
548
+ let totalPrompts;
549
+ if (idx === null) {
550
+ // One-time migration fallback — build from full log.
551
+ idx = {};
552
+ const prompts = await readAllPrompts();
553
+ for (const p of prompts) {
554
+ if (!p.cwd) continue;
555
+ const e2 = encodeCwd(p.cwd);
556
+ if (!idx[e2]) idx[e2] = { count: 0, cwd: p.cwd };
557
+ idx[e2].count++;
558
+ idx[e2].cwd = p.cwd;
559
+ }
560
+ await savePromptIndex(idx).catch(() => {});
561
+ totalPrompts = idx[enc]?.count ?? 0;
562
+ } else {
563
+ totalPrompts = idx[enc]?.count ?? 0;
564
+ }
514
565
  return {
515
566
  cwd: target,
516
567
  label: shortLabel(target),
@@ -518,8 +569,8 @@ async function getState(cwd) {
518
569
  edges: g.edges,
519
570
  status: {
520
571
  promptCount: g.promptCount || 0,
521
- totalPrompts: prompts.length,
522
- pending: Math.max(0, prompts.length - (g.promptCount || 0)),
572
+ totalPrompts,
573
+ pending: Math.max(0, totalPrompts - (g.promptCount || 0)),
523
574
  lastIngest: g.updatedAt,
524
575
  ingesting,
525
576
  logPath: LOG_PATH,
@@ -584,8 +635,14 @@ function init(opts = {}) {
584
635
  fs.mkdirSync(KG_DIR, { recursive: true });
585
636
  fs.watch(KG_DIR, (_evt, file) => {
586
637
  if (file && file !== 'prompts.jsonl') return;
587
- if (watchTimer) clearTimeout(watchTimer);
588
- watchTimer = setTimeout(() => { ingest().catch(() => {}); }, 8_000);
638
+ // Leading-edge coalesce: first new prompt arms the timer; later prompts
639
+ // ride along instead of resetting it, so busy periods can't starve
640
+ // ingest and every run sees a full window's worth of prompts.
641
+ if (watchTimer) return;
642
+ watchTimer = setTimeout(() => {
643
+ watchTimer = null;
644
+ ingest().catch(() => {});
645
+ }, WATCH_COALESCE_MS);
589
646
  });
590
647
  } catch { /* watch is best-effort */ }
591
648
  }
@@ -168,6 +168,13 @@ async function refreshIfNeeded(forceRefresh = false) {
168
168
  }
169
169
 
170
170
  if (alreadyExpired) {
171
+ // Re-read from disk in case credentials were externally refreshed (e.g. via
172
+ // `claude login`) between our initial read and the failed OAuth attempt.
173
+ const recheckCr = await readCredentials();
174
+ if (recheckCr.kind === 'ok' && !isExpired(recheckCr.creds)) {
175
+ appendRefreshLog({ event: 'externally_refreshed_ok', recheckExpiresAt: recheckCr.creds.expiresAt ?? null });
176
+ return { kind: 'ok', creds: recheckCr.creds };
177
+ }
171
178
  const ms = expiresAtMs(creds);
172
179
  appendRefreshLog({ event: 'auth_failed_expired', expiredAtMs: ms });
173
180
  return {
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Pure E2E session state machine for the web-remote relay.
3
+ * No Electron, no I/O — importable in unit tests.
4
+ *
5
+ * State transitions:
6
+ * idle → pending_sas : successful deriveSessionKey + deriveSas
7
+ * idle → failed : crypto derivation error
8
+ * pending_sas → authenticated : user confirms SAS
9
+ * pending_sas → failed : deriveSas threw after sessionKey succeeded
10
+ * any → idle : disconnect / reset
11
+ */
12
+
13
/**
 * Build an E2E state record. All arguments are optional; calling with none
 * yields the initial `idle` state with no key and no pending SAS.
 *
 * @param {string} [state='idle'] State name (idle | pending_sas | authenticated | failed).
 * @param {Buffer|null} [sessionKey=null] Derived session key, if any.
 * @param {string|null} [pendingSas=null] SAS awaiting user confirmation, if any.
 * @returns {{ state: string, sessionKey: Buffer|null, pendingSas: string|null }}
 */
function makeState(state = 'idle', sessionKey = null, pendingSas = null) {
  return { state, sessionKey, pendingSas };
}

/**
 * Rejection code reported by confirmSas() for each state other than
 * `pending_sas`. Kept in a Map rather than an object literal so a lookup with
 * an arbitrary state string ('constructor', 'toString', '__proto__', …) can
 * never resolve to an inherited Object.prototype member.
 */
const CONFIRM_SAS_ERRORS = new Map([
  ['idle', 'no_e2e_session'],
  ['failed', 'e2e_failed'],
  ['authenticated', 'already_authenticated'],
]);

/**
 * Attempt to confirm the SAS. Pure — does not mutate; returns the next state.
 *
 * Only `pending_sas` can be confirmed. In that case the result carries a new
 * `authenticated` state that keeps the session key and clears the pending SAS.
 * For every other state the input is handed back untouched as `next`, along
 * with a string error code (`unexpected_state` for unknown state names).
 *
 * @param {{ state: string, sessionKey: Buffer|null, pendingSas: string|null }} e2eState
 * @returns {{ ok: boolean, error?: string, next: { state: string, sessionKey: Buffer|null, pendingSas: string|null } }}
 */
function confirmSas(e2eState) {
  if (e2eState.state !== 'pending_sas') {
    const error = CONFIRM_SAS_ERRORS.get(e2eState.state) ?? 'unexpected_state';
    return { ok: false, error, next: e2eState };
  }
  return {
    ok: true,
    next: makeState('authenticated', e2eState.sessionKey, null),
  };
}
38
+
39
+ module.exports = { makeState, confirmSas };
@@ -50,6 +50,19 @@ const VERDICTS_SCHEMA_VERSION = 1;
50
50
  * 2. Traceback + Error within 10 lines (Python exception)
51
51
  * 3. ModuleNotFoundError / ImportError (missing venv / broken deps)
52
52
  */
53
/**
 * Decide whether a tool_result payload is a Claude Code harness rejection
 * rather than output produced by the task itself.
 *
 * The harness emits these when the model calls a tool that doesn't exist or
 * isn't allowed (e.g. `<tool_use_error>Error: No such tool available: bash`);
 * the model recovers by retrying with a valid tool. Such results are never
 * task failures, so the verifier must not downgrade a run because of them.
 *
 * @param {*} content tool_result content; anything but a non-empty string is rejected.
 * @returns {boolean} true when the content carries a harness tool-error marker.
 */
function isHarnessToolError(content) {
  const isNonEmptyText = typeof content === 'string' && content.length > 0;
  if (!isNonEmptyText) return false;
  if (content.includes('<tool_use_error>')) return true;
  return /\bNo such tool available\b/.test(content);
}
65
+
53
66
  function detectPattern(content) {
54
67
  if (typeof content !== 'string' || !content) return null;
55
68
 
@@ -58,20 +71,24 @@ function detectPattern(content) {
58
71
  return { verdict: 'transcript_errors', pattern: 'FAIL/FATAL at line start' };
59
72
  }
60
73
 
61
- // (2) Python Traceback + Error line within next 10 lines.
74
+ // (2) Python Traceback + exception line within next 10 lines. Both anchored
75
+ // to line starts: reviewer prose quoting "will crash with ImportError" or
76
+ // embedding "...Error:" mid-sentence must not match (feedback 2026-06-10-01).
62
77
  const lines = content.split('\n');
63
78
  for (let i = 0; i < lines.length; i++) {
64
- if (lines[i].includes('Traceback (most recent call last):')) {
79
+ if (/^\s*Traceback \(most recent call last\):/.test(lines[i])) {
65
80
  for (let j = i + 1; j < Math.min(i + 11, lines.length); j++) {
66
- if (lines[j].includes('Error:')) {
81
+ if (/^\s*[A-Za-z_][\w.]*(?:Error|Exception)\s*:/.test(lines[j])) {
67
82
  return { verdict: 'transcript_errors', pattern: 'Traceback + Error within 10 lines' };
68
83
  }
69
84
  }
70
85
  }
71
86
  }
72
87
 
73
- // (3) Import / module errors (verification was skipped).
74
- if (content.includes('ModuleNotFoundError') || content.includes('ImportError')) {
88
+ // (3) Import / module errors (verification was skipped). Line-anchored:
89
+ // real interpreter output starts the line with the exception name
90
+ // ("ModuleNotFoundError: No module named 'x'"); prose never does.
91
+ if (/^\s*(?:ModuleNotFoundError|ImportError)\s*(?::|$)/m.test(content)) {
75
92
  return { verdict: 'verify_unavailable', pattern: 'ModuleNotFoundError/ImportError' };
76
93
  }
77
94
 
@@ -195,6 +212,18 @@ function toolUseDesc(events, toolUseId) {
195
212
  return '';
196
213
  }
197
214
 
215
/**
 * Look up which tool produced a given tool_result.
 *
 * Finds the first `tool_use` event whose `toolUseId` equals the given id and
 * reports its `toolName`.
 *
 * @param {Array<Object>} events Transcript events to search.
 * @param {string} toolUseId Id carried by the tool_result being examined.
 * @returns {string} The tool name, or '' when the id is empty, no matching
 *   tool_use exists, or the matching event has no toolName.
 */
function toolUseName(events, toolUseId) {
  if (!toolUseId) return '';
  const origin = events.find(
    (ev) => ev.kind === 'tool_use' && ev.toolUseId === toolUseId,
  );
  return origin ? (origin.toolName ?? '') : '';
}
226
+
198
227
  /**
199
228
  * Check whether the next ≤5 tool_use calls after `fromSeq` include a package
200
229
  * install command (pip install, pip3 install, uv sync, uv pip install).
@@ -456,6 +485,15 @@ async function verifyRun({ runDir, prdPath, queueEntry, allJobs = [] }) {
456
485
  const ev = events[i];
457
486
  if (ev.kind !== 'tool_result') continue;
458
487
 
488
+ // Harness tool errors (`<tool_use_error>…`) are emitted when the model
489
+ // requests a tool that isn't available — e.g. a wrong-case name like
490
+ // "bash" instead of "Bash", or a tool outside the allowlist. The harness
491
+ // rejects the call and the model retries with a valid tool; the task is
492
+ // unaffected. These are never task failures, so they are exempt from both
493
+ // the is_error scan and the content pattern scan (false-positive class
494
+ // seen in 58-web-remote-correctness-batch, 2026-06-10).
495
+ if (isHarnessToolError(ev.content)) continue;
496
+
459
497
  // is_error:true in the final 20% of the transcript.
460
498
  if (ev.isError && i >= last20pctStart) {
461
499
  const desc = toolUseDesc(events, ev.toolUseId);
@@ -471,6 +509,12 @@ async function verifyRun({ runDir, prdPath, queueEntry, allJobs = [] }) {
471
509
 
472
510
  if (!ev.content) continue;
473
511
 
512
+ // Subagent (Task) results are structured prose — review findings that
513
+ // *describe* exceptions ("will crash with ImportError") are the dominant
514
+ // false-positive source (feedback 2026-06-10-01). Real runtime errors
515
+ // surface through Bash/test tool_results, which are still scanned.
516
+ if (toolUseName(events, ev.toolUseId) === 'Task') continue;
517
+
474
518
  const hit = detectPattern(ev.content);
475
519
  if (!hit) continue;
476
520
 
@@ -520,6 +564,8 @@ module.exports = {
520
564
  verifyRun,
521
565
  // Exposed for unit tests.
522
566
  detectPattern,
567
+ isHarnessToolError,
568
+ toolUseName,
523
569
  extractSoakFromBody,
524
570
  parsePrdBodyDepFragments,
525
571
  checkDeps,
@@ -15,9 +15,24 @@
15
15
 
16
16
  const fs = require('node:fs');
17
17
  const fsp = require('node:fs/promises');
18
+ const os = require('node:os');
18
19
  const path = require('node:path');
19
20
  const { splitFrontmatter } = require('../lib/prdFrontmatter.cjs');
20
21
 
22
/**
 * Expand a PRD `cwd` value to an absolute path.
 *   - `~` alone or `~/...` → absolute under os.homedir()
 *     (`~\...` is accepted as well when the platform separator is `\`).
 *   - Already-absolute paths pass through unchanged.
 *   - Bare relative paths → joined onto os.homedir().
 *   - null / empty / non-string values → null (caller falls back to defaultCwd).
 *
 * @param {*} cwd Raw `cwd` value taken from the PRD frontmatter.
 * @returns {string|null} Absolute path, or null when no usable value was given.
 */
function expandCwd(cwd) {
  // Frontmatter values are not guaranteed to be strings; a truthy number or
  // boolean would otherwise blow up on `.startsWith` below.
  if (typeof cwd !== 'string' || cwd === '') return null;

  const isHomeRelative = cwd === '~'
    || cwd.startsWith('~/')
    || (path.sep === '\\' && cwd.startsWith('~\\'));
  // Drop the leading `~`; path.join absorbs the separator that follows it.
  if (isHomeRelative) return path.join(os.homedir(), cwd.slice(1));

  if (path.isAbsolute(cwd)) return cwd;
  return path.join(os.homedir(), cwd);
}
35
+
21
36
  // Hard cap to keep one malformed PRD (e.g. a binary blob accidentally renamed
22
37
  // .md) from wedging the main thread. PRDs are PRDs, not media files; 1 MB is
23
38
  // already ~25k lines and well beyond any legitimate authored doc.
@@ -46,7 +61,7 @@ async function parsePrdRaw(filePath) {
46
61
  slug: base,
47
62
  path: filePath,
48
63
  title: fm.title || base,
49
- cwd: fm.cwd || null,
64
+ cwd: expandCwd(fm.cwd || null),
50
65
  estimateMinutes: fm.estimateMinutes ? Number(fm.estimateMinutes) || null : null,
51
66
  parallelGroup: (fm.parallelGroup ? Number(fm.parallelGroup) || null : null) ?? groupFromName ?? 99,
52
67
  body: body.trim(),