claude-code-session-manager 0.21.3 → 0.21.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.html CHANGED
@@ -7,7 +7,7 @@
7
7
  <link rel="preconnect" href="https://fonts.googleapis.com">
8
8
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
9
  <link href="https://fonts.googleapis.com/css2?family=Newsreader:ital,opsz,wght@0,6..72,400;0,6..72,500;0,6..72,600;0,6..72,700;1,6..72,400&family=Geist:wght@300;400;500;600;700&family=IBM+Plex+Mono:wght@400;500;600&display=swap" rel="stylesheet">
10
- <script type="module" crossorigin src="./assets/index-DO3ROR11.js"></script>
10
+ <script type="module" crossorigin src="./assets/index-BUrrcj7x.js"></script>
11
11
  <link rel="modulepreload" crossorigin href="./assets/monaco-editor-BW5C4Iv1.js">
12
12
  <link rel="stylesheet" crossorigin href="./assets/monaco-editor-BTnBOi8r.css">
13
13
  <link rel="stylesheet" crossorigin href="./assets/index-DeQI4oVI.css">
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-code-session-manager",
3
- "version": "0.21.3",
3
+ "version": "0.21.4",
4
4
  "description": "Local cockpit for the Claude Code CLI — multi-tab terminal, full config surface, scheduler, voice dictation, and live observability.",
5
5
  "type": "module",
6
6
  "main": "src/main/index.cjs",
@@ -487,3 +487,40 @@ test('feedback 01: quoted "Traceback..." line (leading quote) → clean', async
487
487
  assert.equal(verdict.verdict, 'clean', `quoted traceback prose must not flag, got ${verdict.verdict}: ${verdict.reason}`);
488
488
  } finally { rmdir(tmp); }
489
489
  });
490
+
491
+ // ─── harness tool errors (feedback follow-up 2026-06-10) ─────────────────────
492
+
493
test('harness tool error (<tool_use_error>) in final 20% → clean', async () => {
  const tmp = makeTmpDir();
  try {
    const slug = '58-harness-tool-error';

    // Small factories for the two transcript event shapes this fixture uses.
    const toolCall = (id, name, description) => ({
      type: 'assistant',
      message: { role: 'assistant', content: [
        { type: 'tool_use', id, name, input: { description } }] },
    });
    const toolOutcome = (toolUseId, content, isError) => ({
      type: 'user',
      message: { role: 'user', content: [
        { type: 'tool_result', tool_use_id: toolUseId, content, is_error: isError }] },
    });

    // Eight benign Read round-trips pad the transcript so the rejected call
    // lands in the final 20%; a successful result follows it — mirrors the
    // real 58-web-remote-correctness-batch run.
    const padding = Array.from({ length: 8 }, (_, k) => [
      toolCall(`t${k}`, 'Read', `read ${k}`),
      toolOutcome(`t${k}`, 'ok', false),
    ]).flat();

    const events = [
      ...padding,
      toolCall('tbad', 'bash', 'run tests'),
      toolOutcome('tbad', '<tool_use_error>Error: No such tool available: bash</tool_use_error>', true),
      { type: 'result', subtype: 'success', result: 'All acceptance criteria verified.' },
    ];

    writeLog(tmp, slug, events);
    const prdPath = writePrd(tmp, slug, '# Correctness batch');
    const verdict = await verifyRun({
      runDir: tmp,
      prdPath,
      queueEntry: { slug, status: 'running' },
      allJobs: [],
    });
    assert.equal(verdict.verdict, 'clean', `harness tool error must not flag, got ${verdict.verdict}: ${verdict.reason}`);
  } finally {
    rmdir(tmp);
  }
});
519
+
520
test('isHarnessToolError detects wrapper and "No such tool available"', () => {
  const { isHarnessToolError } = require('../runVerify.cjs');

  // [tool_result content, expected classification]
  const cases = [
    ['<tool_use_error>Error: No such tool available: bash</tool_use_error>', true],
    ['No such tool available: Foo', true],
    ['ModuleNotFoundError: No module named x', false],
    ['', false],
  ];

  for (const [content, expected] of cases) {
    assert.equal(isHarnessToolError(content), expected);
  }
});
@@ -50,6 +50,19 @@ const VERDICTS_SCHEMA_VERSION = 1;
50
50
  * 2. Traceback + Error within 10 lines (Python exception)
51
51
  * 3. ModuleNotFoundError / ImportError (missing venv / broken deps)
52
52
  */
53
/**
 * Report whether a tool_result's content is a Claude Code harness tool error
 * rather than task output — emitted when the model calls a tool that doesn't
 * exist or isn't allowed (e.g.
 * `<tool_use_error>Error: No such tool available: bash</tool_use_error>`).
 * The harness rejects the call and the model retries with a valid tool, so
 * the verifier must not downgrade a run because of it.
 *
 * The marker must appear at the START of the content (leading whitespace and
 * an optional `Error:` prefix are tolerated). A match anywhere in the text
 * would also exempt genuine task output that merely quotes the marker — for
 * example a failing test run that prints a test title containing
 * `<tool_use_error>` — and hide a real failure from the verifier.
 *
 * @param {*} content tool_result content; anything other than a non-empty
 *   string is never a harness error.
 * @returns {boolean} true only when the content begins with the
 *   `<tool_use_error>` wrapper or with "No such tool available".
 */
function isHarnessToolError(content) {
  if (typeof content !== 'string' || !content) return false;
  return /^\s*(?:Error:\s*)?(?:<tool_use_error>|No such tool available\b)/.test(content);
}
65
+
53
66
  function detectPattern(content) {
54
67
  if (typeof content !== 'string' || !content) return null;
55
68
 
@@ -472,6 +485,15 @@ async function verifyRun({ runDir, prdPath, queueEntry, allJobs = [] }) {
472
485
  const ev = events[i];
473
486
  if (ev.kind !== 'tool_result') continue;
474
487
 
488
+ // Harness tool errors (`<tool_use_error>…`) are emitted when the model
489
+ // requests a tool that isn't available — e.g. a wrong-case name like
490
+ // "bash" instead of "Bash", or a tool outside the allowlist. The harness
491
+ // rejects the call and the model retries with a valid tool; the task is
492
+ // unaffected. These are never task failures, so they are exempt from both
493
+ // the is_error scan and the content pattern scan (false-positive class
494
+ // seen in 58-web-remote-correctness-batch, 2026-06-10).
495
+ if (isHarnessToolError(ev.content)) continue;
496
+
475
497
  // is_error:true in the final 20% of the transcript.
476
498
  if (ev.isError && i >= last20pctStart) {
477
499
  const desc = toolUseDesc(events, ev.toolUseId);
@@ -542,6 +564,7 @@ module.exports = {
542
564
  verifyRun,
543
565
  // Exposed for unit tests.
544
566
  detectPattern,
567
+ isHarnessToolError,
545
568
  toolUseName,
546
569
  extractSoakFromBody,
547
570
  parsePrdBodyDepFragments,
@@ -1629,6 +1629,66 @@ function selectHistoryJobs(jobs, limit) {
1629
1629
  .slice(0, cap);
1630
1630
  }
1631
1631
 
1632
// Verdicts produced by the transcript scan, which a fresh verifyRun call can
// re-evaluate. 'uncommitted_changes' is deliberately absent: it originates in
// the git commit-guard, which verifyRun does not inspect, so a re-scan would
// always come back 'clean' and wrongly heal a genuinely-unfinished job.
const RESCANNABLE_VERDICTS = new Set(['transcript_errors', 'verify_unavailable']);

/**
 * Pure predicate deciding whether a job may go through the boot re-verify
 * self-heal. A job qualifies only when it is in needs_review, has a runId
 * (i.e. a run log to re-scan), and carries a transcript-scan verdict.
 * 'uncommitted_changes' (git commit-guard) never qualifies — verifyRun can't
 * see git, so re-scanning it would falsely heal an unfinished job.
 * Exported for tests.
 *
 * @param {object|null|undefined} job queue entry to test
 * @returns {boolean} true when the job is eligible for re-verification
 */
function isRescanCandidate(job) {
  if (!job) return false;
  if (job.status !== 'needs_review') return false;
  if (!job.runId) return false;
  return RESCANNABLE_VERDICTS.has(job.verifierVerdict);
}
1650
+
1651
/**
 * Self-healing pass over needs_review jobs. The verifier runs in-process, so a
 * fix to runVerify.cjs only takes effect for jobs verified AFTER an app
 * restart — jobs flagged by the old (buggy) verifier would otherwise stay in
 * needs_review forever. This re-runs the CURRENT verifier over every
 * transcript-scan needs_review job (see isRescanCandidate) and auto-completes
 * the ones that now pass clean, so verifier improvements retroactively clear
 * their own false positives.
 *
 * Safety properties:
 *  - A verified run is identified by runId + slug, never by slug alone, so a
 *    different needs_review entry that happens to share the slug (and was not
 *    re-verified, e.g. an 'uncommitted_changes' one) is never completed.
 *  - Eligibility is re-checked inside the mutate callback against the queue
 *    state being written, because the snapshot used for verification may be
 *    stale by the time the write happens.
 *  - A verifier exception leaves the job untouched for human review.
 *
 * @returns {Promise<{rescanned:number, healed:string[]}>} `rescanned` is the
 *   number of candidates re-verified; `healed` holds the slugs of the jobs
 *   that were actually moved to 'completed'.
 */
async function reverifyNeedsReview() {
  const snap = await readQueue();
  const candidates = snap.jobs.filter(isRescanCandidate);

  // Identity of one verified run of one job.
  const runKey = (job) => `${job.runId}\u0000${job.slug}`;

  const cleanRuns = new Set();
  for (const job of candidates) {
    const runDir = path.join(RUNS_DIR, job.runId);
    const prdPath = path.join(PRDS_DIR, `${job.slug}.md`);
    try {
      const v = await verifyRun({ runDir, prdPath, queueEntry: job, allJobs: snap.jobs });
      if (v?.verdict === 'clean') cleanRuns.add(runKey(job));
    } catch (err) {
      // Best effort: unreadable log etc. — leave the job for human review,
      // but say so instead of failing silently.
      console.warn(`[scheduler] boot reverify: skipped ${job.slug} (${job.runId}): ${err?.message ?? err}`);
    }
  }

  const healed = [];
  if (cleanRuns.size > 0) {
    await mutate((s) => {
      // Start from empty in case the callback is invoked more than once.
      healed.length = 0;
      for (const j of s.jobs) {
        if (isRescanCandidate(j) && cleanRuns.has(runKey(j))) {
          j.status = 'completed';
          j.error = null;
          delete j.verifierVerdict;
          healed.push(j.slug);
        }
      }
    });
  }

  if (healed.length > 0) {
    console.log(`[scheduler] boot reverify: healed ${healed.length} stale needs_review → completed (${healed.join(', ')})`);
    await broadcast();
  }
  return { rescanned: candidates.length, healed };
}
1691
+
1632
1692
  function registerScheduleHandlers() {
1633
1693
  ensureDirs();
1634
1694
  supervisor.registerHandlers();
@@ -1666,6 +1726,13 @@ function registerScheduleHandlers() {
1666
1726
  };
1667
1727
  });
1668
1728
 
1729
+ ipcMain.handle('schedule:reverify-needs-review', async () => {
1730
+ // Manual trigger for the boot self-heal pass — re-scan needs_review jobs
1731
+ // with the current verifier and auto-complete the ones that now pass clean.
1732
+ const result = await reverifyNeedsReview();
1733
+ return { ok: true, ...result };
1734
+ });
1735
+
1669
1736
  ipcMain.handle('schedule:force-tick', async () => {
1670
1737
  // Bypass the billing-poll gate entirely — fire pending jobs immediately regardless of meter state.
1671
1738
  // Clears any existing pause first (same semantics as run-now).
@@ -1921,6 +1988,13 @@ async function init() {
1921
1988
  await setPaused(boot.paused.reason, boot.paused.resumeAt);
1922
1989
  }
1923
1990
 
1991
+ // Self-heal stale needs_review flags using the current verifier (see
1992
+ // reverifyNeedsReview). Runs once on boot so a shipped verifier fix clears
1993
+ // its own historical false positives without manual retagging.
1994
+ await reverifyNeedsReview().catch((e) => {
1995
+ console.error(`[scheduler] boot reverify failed: ${e?.message ?? e}`);
1996
+ });
1997
+
1924
1998
  await rescheduleTimer();
1925
1999
  // Refresh next-reset every 10 minutes — billing window can shift if usage
1926
2000
  // resets early or the auth token rotates. Tracked so re-init doesn't leak.
@@ -2089,4 +2163,4 @@ const remote = {
2089
2163
  },
2090
2164
  };
2091
2165
 
2092
- module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR, selectHistoryJobs, parsePorcelain, FINISH_PROTOCOL, remote, pickNextBatch, pickForProject, reapDeadRunningJobs, pollRecoveryClearSource, memoryLimitedBatchSize };
2166
+ module.exports = { registerScheduleHandlers, attachWindow, init, ROOT, PRDS_DIR, selectHistoryJobs, parsePorcelain, FINISH_PROTOCOL, remote, pickNextBatch, pickForProject, reapDeadRunningJobs, pollRecoveryClearSource, memoryLimitedBatchSize, reverifyNeedsReview, isRescanCandidate };
@@ -589,9 +589,13 @@ async function tailLines(filePath, fromOffset) {
589
589
  const len = stat.size - start;
590
590
  const buf = Buffer.alloc(len);
591
591
  await fd.read(buf, 0, len, start);
592
- const parts = buf.toString('utf8').split('\n').filter(Boolean);
592
+ // Shift before filter: the fragment may be an empty string (when the
593
+ // buffer starts with '\n', completing the previous partial line). If we
594
+ // filter(Boolean) first, the empty fragment disappears and shift() would
595
+ // remove the first valid line instead.
596
+ const parts = buf.toString('utf8').split('\n');
593
597
  if (dropFirst && parts.length) parts.shift();
594
- return { lines: parts, size: stat.size, inode: stat.ino };
598
+ return { lines: parts.filter(Boolean), size: stat.size, inode: stat.ino };
595
599
  } finally {
596
600
  await fd.close();
597
601
  }