switchroom 0.14.22 → 0.14.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -49420,8 +49420,8 @@ var {
49420
49420
  } = import__.default;
49421
49421
 
49422
49422
  // src/build-info.ts
49423
- var VERSION = "0.14.22";
49424
- var COMMIT_SHA = "ab2692b9";
49423
+ var VERSION = "0.14.24";
49424
+ var COMMIT_SHA = "2711d052";
49425
49425
 
49426
49426
  // src/cli/agent.ts
49427
49427
  init_source();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "switchroom",
3
- "version": "0.14.22",
3
+ "version": "0.14.24",
4
4
  "description": "Run Claude Code 24/7 on your Claude Pro/Max subscription over Telegram. Open-source alternative to OpenClaw and NanoClaw — no API keys.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -48827,6 +48827,67 @@ import {
48827
48827
  import { join as join21 } from "path";
48828
48828
 
48829
48829
  // operator-events.ts
48830
+ function classifyClaudeError(raw) {
48831
+ try {
48832
+ return classifyInner(raw);
48833
+ } catch {
48834
+ return "unknown-4xx";
48835
+ }
48836
+ }
48837
+ function classifyInner(raw) {
48838
+ if (raw == null)
48839
+ return "unknown-4xx";
48840
+ const obj = typeof raw === "object" ? raw : {};
48841
+ const errorType = extractString(obj, "error_type") ?? extractString(obj, "type") ?? extractString(getNestedObj(obj, "error"), "type") ?? "";
48842
+ const errorCode = extractString(obj, "code") ?? extractString(getNestedObj(obj, "error"), "code") ?? "";
48843
+ const message = extractString(obj, "message") ?? extractString(getNestedObj(obj, "error"), "message") ?? (typeof raw === "string" ? raw : "") ?? "";
48844
+ const status = extractNumber(obj, "status") ?? extractNumber(obj, "statusCode") ?? extractNumber(obj, "status_code") ?? null;
48845
+ const sdkCode = extractString(obj, "error_code") ?? "";
48846
+ if (errorType === "authentication_error" || errorCode === "authentication_error" || sdkCode === "authentication_error" || message.toLowerCase().includes("authentication_error")) {
48847
+ const msg = message.toLowerCase();
48848
+ if (msg.includes("expired") || msg.includes("refresh")) {
48849
+ return "credentials-expired";
48850
+ }
48851
+ return "credentials-invalid";
48852
+ }
48853
+ if (errorType === "invalid_api_key" || errorCode === "invalid_api_key" || sdkCode === "invalid_api_key" || message.toLowerCase().includes("invalid_api_key") || message.toLowerCase().includes("invalid api key")) {
48854
+ return "credentials-invalid";
48855
+ }
48856
+ if (errorType === "credit_balance_too_low" || errorCode === "credit_balance_too_low" || sdkCode === "credit_balance_too_low" || message.toLowerCase().includes("credit_balance_too_low") || message.toLowerCase().includes("credit balance")) {
48857
+ return "credit-exhausted";
48858
+ }
48859
+ if (errorType === "rate_limit_error" || errorCode === "rate_limit_error" || sdkCode === "rate_limit_error" || message.toLowerCase().includes("rate_limit_error") || message.toLowerCase().includes("rate limit")) {
48860
+ return "rate-limited";
48861
+ }
48862
+ if (errorType === "overloaded_error" || errorCode === "overloaded_error" || sdkCode === "overloaded_error" || message.toLowerCase().includes("overloaded_error") || message.toLowerCase().includes("overloaded")) {
48863
+ return "rate-limited";
48864
+ }
48865
+ if (errorType === "agent-crashed" || errorCode === "agent-crashed") {
48866
+ return "agent-crashed";
48867
+ }
48868
+ if (errorType === "agent-restarted-unexpectedly" || errorCode === "agent-restarted-unexpectedly") {
48869
+ return "agent-restarted-unexpectedly";
48870
+ }
48871
+ if (status != null) {
48872
+ if (status >= 400 && status < 500)
48873
+ return "unknown-4xx";
48874
+ if (status >= 500 && status < 600)
48875
+ return "unknown-5xx";
48876
+ }
48877
+ return "unknown-4xx";
48878
+ }
48879
+ function extractString(obj, key) {
48880
+ const v = obj[key];
48881
+ return typeof v === "string" && v.length > 0 ? v : null;
48882
+ }
48883
+ function extractNumber(obj, key) {
48884
+ const v = obj[key];
48885
+ return typeof v === "number" ? v : null;
48886
+ }
48887
+ function getNestedObj(obj, key) {
48888
+ const v = obj[key];
48889
+ return typeof v === "object" && v != null ? v : {};
48890
+ }
48830
48891
  var DEFAULT_OPERATOR_EVENT_COOLDOWN_MS2 = 5 * 60000;
48831
48892
  var cooldownMap2 = new Map;
48832
48893
 
@@ -48936,6 +48997,72 @@ function projectSubagentLine(line, agentId, state4) {
48936
48997
  }
48937
48998
  return [];
48938
48999
  }
49000
+ function extractRetryState(obj) {
49001
+ return {
49002
+ retryAttempt: typeof obj.retryAttempt === "number" ? obj.retryAttempt : null,
49003
+ maxRetries: typeof obj.maxRetries === "number" ? obj.maxRetries : null
49004
+ };
49005
+ }
49006
+ function detectErrorInTranscriptLine(line) {
49007
+ if (!line || line.length > 2 * 1024 * 1024)
49008
+ return null;
49009
+ let obj;
49010
+ try {
49011
+ obj = JSON.parse(line);
49012
+ } catch {
49013
+ return null;
49014
+ }
49015
+ if (typeof obj !== "object" || obj == null)
49016
+ return null;
49017
+ const type = obj.type;
49018
+ if (obj.isApiErrorMessage === true) {
49019
+ const status = typeof obj.apiErrorStatus === "number" ? obj.apiErrorStatus : null;
49020
+ const errStr = typeof obj.error === "string" ? obj.error : "";
49021
+ const text = extractAssistantText(obj);
49022
+ const kind2 = status === 429 ? "quota-exhausted" : classifyClaudeError({ type: errStr, status, message: text });
49023
+ return {
49024
+ kind: kind2,
49025
+ raw: obj,
49026
+ detail: text || errStr || "api error",
49027
+ transient: kind2 === "rate-limited",
49028
+ terminal: true
49029
+ };
49030
+ }
49031
+ const isErrorLine = type === "api_error" || type === "error";
49032
+ const embeddedError = typeof obj.error === "object" && obj.error != null ? obj.error : null;
49033
+ if (!isErrorLine && !embeddedError)
49034
+ return null;
49035
+ const raw = embeddedError ?? obj;
49036
+ const kind = classifyClaudeError(embeddedError ?? obj);
49037
+ const detail = extractDetailMessage(embeddedError) ?? extractDetailMessage(obj) ?? String(type ?? "");
49038
+ const transient = kind === "rate-limited";
49039
+ const retry = extractRetryState(obj);
49040
+ const terminal = !transient ? true : retry.retryAttempt != null && retry.maxRetries != null ? retry.retryAttempt >= retry.maxRetries : isErrorLine;
49041
+ return { kind, raw, detail, transient, terminal };
49042
+ }
49043
+ function extractDetailMessage(obj) {
49044
+ if (!obj)
49045
+ return null;
49046
+ const msg = obj.message;
49047
+ return typeof msg === "string" && msg.length > 0 ? msg : null;
49048
+ }
49049
+ function extractAssistantText(obj) {
49050
+ const message = obj.message;
49051
+ if (typeof message !== "object" || message == null)
49052
+ return "";
49053
+ const content = message.content;
49054
+ if (!Array.isArray(content))
49055
+ return "";
49056
+ const parts = [];
49057
+ for (const block of content) {
49058
+ if (typeof block === "object" && block != null && block.type === "text") {
49059
+ const t = block.text;
49060
+ if (typeof t === "string")
49061
+ parts.push(t);
49062
+ }
49063
+ }
49064
+ return parts.join(" ").trim();
49065
+ }
48939
49066
 
48940
49067
  // fleet-state.ts
48941
49068
  var SANITISE_MAX_LEN = 120;
@@ -49098,6 +49225,7 @@ var DEFAULT_RESCAN_MS = 1000;
49098
49225
  var DEFAULT_STALL_THRESHOLD_MS = 60000;
49099
49226
  var DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS = 300000;
49100
49227
  var DEFAULT_SILENT_STALL_TERMINAL_MS = 300000;
49228
+ var DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS = 15 * 60000;
49101
49229
  var SUBAGENT_RESULT_TEXT_MAX = 3000;
49102
49230
  function parseEnvMs(varName) {
49103
49231
  const raw = process.env[varName];
@@ -49189,6 +49317,12 @@ function readSubTail(entry, tail, now, onDescriptionUpdate, fs2, log, db2, paren
49189
49317
  for (const line of lines) {
49190
49318
  if (!line)
49191
49319
  continue;
49320
+ const errInfo = detectErrorInTranscriptLine(line);
49321
+ if (errInfo?.terminal) {
49322
+ entry.errored = true;
49323
+ if (errInfo.detail)
49324
+ entry.errorDetail = errInfo.detail.slice(0, SUBAGENT_RESULT_TEXT_MAX);
49325
+ }
49192
49326
  const events = projectSubagentLine(line, entry.agentId, startState);
49193
49327
  for (const ev of events) {
49194
49328
  const idleSecBeforeBump = Math.round((now - entry.lastActivityAt) / 1000);
@@ -49253,7 +49387,7 @@ function readSubTail(entry, tail, now, onDescriptionUpdate, fs2, log, db2, paren
49253
49387
  recordSubagentEnd(db2, {
49254
49388
  id: rowRef.id,
49255
49389
  endedAt: now,
49256
- status: "completed"
49390
+ status: entry.errored ? "failed" : "completed"
49257
49391
  });
49258
49392
  }
49259
49393
  } catch (dbErr) {
@@ -49282,6 +49416,8 @@ function startSubagentWatcher(config) {
49282
49416
  const stallThresholdMs = config.stallThresholdMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_STALL_MS") ?? DEFAULT_STALL_THRESHOLD_MS;
49283
49417
  const silentSynthesisStallThresholdMs = config.silentSynthesisStallThresholdMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_SILENT_SYNTH_STALL_MS") ?? DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS;
49284
49418
  const silentStallTerminalMs = config.silentStallTerminalMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_STALL_TERMINAL_MS") ?? DEFAULT_SILENT_STALL_TERMINAL_MS;
49419
+ const inflightPromoteMaxAgeMs = config.inflightPromoteMaxAgeMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS") ?? DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS;
49420
+ const bootPromoteEnabled = config.bootPromoteEnabled ?? process.env.SWITCHROOM_SUBAGENT_BOOT_PROMOTE !== "0";
49285
49421
  const reaperTtlMs = config.reaperTtlMs ?? DEFAULT_REAPER_TTL_MS;
49286
49422
  const reaperIntervalMs = config.reaperIntervalMs ?? DEFAULT_REAPER_INTERVAL_MS;
49287
49423
  const rescanMs = config.rescanMs ?? DEFAULT_RESCAN_MS;
@@ -49363,6 +49499,29 @@ function startSubagentWatcher(config) {
49363
49499
  readSubTail(entry, tail, n, (desc) => {
49364
49500
  log?.(`subagent-watcher: description updated for ${agentId}: ${desc}`);
49365
49501
  }, fs2, log, db2, parentStateDir, config.onUnstall, undefined, config.onProgress);
49502
+ if (isHistorical && entry.state === "running") {
49503
+ let fileAgeMs = Infinity;
49504
+ try {
49505
+ const st = fs2.statSync(filePath);
49506
+ if (typeof st.mtimeMs === "number")
49507
+ fileAgeMs = n - st.mtimeMs;
49508
+ } catch {}
49509
+ if (!bootPromoteEnabled) {
49510
+ log?.(`subagent-watcher: ${agentId} running at boot but promotion disabled (SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0) \u2014 leaving historical`);
49511
+ } else if (fileAgeMs > inflightPromoteMaxAgeMs) {
49512
+ log?.(`subagent-watcher: ${agentId} running at boot but stale (last write ${Math.round(fileAgeMs / 1000)}s ago > ${Math.round(inflightPromoteMaxAgeMs / 1000)}s) \u2014 leaving historical (dead prior-session worker, not in-flight)`);
49513
+ } else {
49514
+ entry.historical = false;
49515
+ log?.(`subagent-watcher: ${agentId} was in-flight at boot \u2014 promoting to live (last write ${Math.round(fileAgeMs / 1000)}s ago; user still awaiting handback)`);
49516
+ if (db2 != null) {
49517
+ try {
49518
+ backfillJsonlAgentId(db2, filePath, agentId, log);
49519
+ } catch (err) {
49520
+ log?.(`subagent-watcher: backfill error for ${agentId}: ${err.message}`);
49521
+ }
49522
+ }
49523
+ }
49524
+ }
49366
49525
  if (isHistorical && entry.state === "done") {
49367
49526
  entry.completionNotified = true;
49368
49527
  scheduleTerminalCleanup(agentId);
@@ -49397,11 +49556,11 @@ function startSubagentWatcher(config) {
49397
49556
  config.onFinish({
49398
49557
  agentId,
49399
49558
  state: entry.state,
49400
- outcome: entry.historical ? "orphan" : "completed",
49559
+ outcome: entry.errored ? "failed" : entry.historical ? "orphan" : "completed",
49401
49560
  toolCount: entry.toolCount,
49402
49561
  durationMs: nowFn() - entry.dispatchedAt,
49403
49562
  description: entry.description,
49404
- resultText: entry.lastResultText
49563
+ resultText: entry.errored ? entry.lastResultText || entry.errorDetail || "" : entry.lastResultText
49405
49564
  });
49406
49565
  } catch (cbErr) {
49407
49566
  log?.(`subagent-watcher: onFinish callback error ${agentId}: ${cbErr.message}`);
@@ -49518,7 +49677,7 @@ function startSubagentWatcher(config) {
49518
49677
  recordSubagentEnd(db2, {
49519
49678
  id: rowRef.id,
49520
49679
  endedAt: n,
49521
- status: "completed"
49680
+ status: entry.errored ? "failed" : "completed"
49522
49681
  });
49523
49682
  }
49524
49683
  } catch (dbErr) {
@@ -51298,10 +51457,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
51298
51457
  }
51299
51458
 
51300
51459
  // ../src/build-info.ts
51301
- var VERSION = "0.14.22";
51302
- var COMMIT_SHA = "ab2692b9";
51303
- var COMMIT_DATE = "2026-05-31T06:26:06Z";
51304
- var LATEST_PR = 2028;
51460
+ var VERSION = "0.14.24";
51461
+ var COMMIT_SHA = "2711d052";
51462
+ var COMMIT_DATE = "2026-05-31T22:59:44Z";
51463
+ var LATEST_PR = 2033;
51305
51464
  var COMMITS_AHEAD_OF_TAG = 0;
51306
51465
 
51307
51466
  // gateway/boot-version.ts
@@ -40,7 +40,7 @@ import {
40
40
  } from 'fs'
41
41
  import { basename, join } from 'path'
42
42
  import { homedir } from 'os'
43
- import { projectSubagentLine, sanitizeCwdToProjectName } from './session-tail.js'
43
+ import { projectSubagentLine, sanitizeCwdToProjectName, detectErrorInTranscriptLine } from './session-tail.js'
44
44
  import { sanitiseToolArg } from './fleet-state.js'
45
45
  import { escapeHtml, truncate } from './card-format.js'
46
46
  import { bumpSubagentActivity, recordSubagentStall, recordSubagentResume, recordSubagentEnd, reapStuckRunningRows } from './registry/subagents-schema.js'
@@ -142,6 +142,21 @@ export interface WorkerEntry {
142
142
  * dead, the file is just left over from a prior session.
143
143
  */
144
144
  historical: boolean
145
+ /**
146
+ * True once a TERMINAL error line — a model API failure / quota
147
+ * exhaustion / crash, NOT an in-flight retry or a routine tool-level
148
+ * `is_error` result — has been observed in this worker's own
149
+ * transcript. Drives the `failed` terminal outcome so the handback
150
+ * tells the user the delegated work did NOT complete, instead of
151
+ * dressing a dead worker up as `completed`. Classified by
152
+ * `detectErrorInTranscriptLine` (the same gate the operator-event
153
+ * path uses), so transient mid-retry errors are excluded.
154
+ */
155
+ errored?: boolean
156
+ /** Human-readable detail from the terminal error line, surfaced in the
157
+ * failed handback's "what it reported before failing" slot when the
158
+ * worker left no narrative result of its own. */
159
+ errorDetail?: string
145
160
  }
146
161
 
147
162
  export interface SubagentWatcherConfig {
@@ -193,6 +208,23 @@ export interface SubagentWatcherConfig {
193
208
  * synthesis; tests use a tiny value to exercise the path.
194
209
  */
195
210
  silentStallTerminalMs?: number
211
+ /**
212
+ * Freshness window (ms) for promoting a running-at-boot worker file to
213
+ * live. A file whose last write (mtime) is older than this is treated as
214
+ * a dead prior-session worker and stays historical/suppressed, NOT
215
+ * promoted. Default 15 min (DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS); env
216
+ * override `SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS`. Guards the v0.14.23
217
+ * stale-handback replay regression.
218
+ */
219
+ inflightPromoteMaxAgeMs?: number
220
+ /**
221
+ * Kill-switch for the boot-scan promotion path. When false, a
222
+ * running-at-boot worker is never promoted — the watcher reverts to the
223
+ * pre-v0.14.23 behaviour of leaving every boot-scan file historical
224
+ * (suppressed). Default true; env `SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0`
225
+ * disables it fleet-wide without a code change (emergency lever).
226
+ */
227
+ bootPromoteEnabled?: boolean
196
228
  /**
197
229
  * Reaper TTL (ms): background rows in `status='running'` whose
198
230
  * `last_activity_at` (or `started_at` if liveness never wrote) is older
@@ -367,6 +399,29 @@ const DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS = 300_000
367
399
  */
368
400
  const DEFAULT_SILENT_STALL_TERMINAL_MS = 300_000
369
401
 
402
+ /**
403
+ * Freshness window for the boot-scan "in-flight at boot → promote to
404
+ * live" path. A worker file still in `running` state at boot is only
405
+ * promoted (un-suppressed) if its last write (file mtime) is within this
406
+ * window of now. The signal cleanly separates the two populations:
407
+ *
408
+ * - A worker genuinely in-flight across a restart / fleet rollout was
409
+ * writing right up until the container was recreated, so its mtime is
410
+ * seconds-to-minutes before the new gateway boots — well inside the
411
+ * window. The user is still awaiting it; promote it.
412
+ * - A worker that died in a PRIOR session without writing a terminal
413
+ * `turn_end` is also `running` in the file, but its mtime is hours-to-
414
+ * weeks old. These accumulate by the dozen-to-hundred in a long-lived
415
+ * agent's subagents dir. Promoting them replays stale handbacks
416
+ * (often `failed`, from old error lines) on every boot — the v0.14.23
417
+ * regression. Leave them historical/suppressed, exactly as before.
418
+ *
419
+ * 15 min is generous for any plausible restart gap (container recreate +
420
+ * image pull) yet far below the staleness of a dead prior-session file.
421
+ * Override with `SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS`.
422
+ */
423
+ const DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS = 15 * 60_000
424
+
370
425
  /**
371
426
  * Cap on the result text retained per sub-agent (`entry.lastResultText`)
372
427
  * and carried to the gateway via `onFinish`. The gateway feeds this into
@@ -611,6 +666,20 @@ export function readSubTail(
611
666
  const startState = { hasEmittedStart: tail.hasEmittedStart }
612
667
  for (const line of lines) {
613
668
  if (!line) continue
669
+ // Gap 2 (failure honesty): a terminal error line in the worker's
670
+ // OWN transcript — a model API failure, quota exhaustion, or crash —
671
+ // means the worker FAILED, not finished. Reuse the operator-event
672
+ // classifier: `terminal:true` excludes in-flight retries (a 529 mid-
673
+ // backoff is `terminal:false`), and tool-level `is_error` results
674
+ // never reach here (they parse as `sub_agent_tool_result`, which is
675
+ // routine mid-run noise, not a worker death). The flag persists on
676
+ // the entry; the terminal transition (real turn_end OR stall
677
+ // synthesis) reads it to emit `failed` instead of `completed`.
678
+ const errInfo = detectErrorInTranscriptLine(line)
679
+ if (errInfo?.terminal) {
680
+ entry.errored = true
681
+ if (errInfo.detail) entry.errorDetail = errInfo.detail.slice(0, SUBAGENT_RESULT_TEXT_MAX)
682
+ }
614
683
  const events = projectSubagentLine(line, entry.agentId, startState)
615
684
  for (const ev of events) {
616
685
  const idleSecBeforeBump = Math.round((now - entry.lastActivityAt) / 1000)
@@ -716,7 +785,10 @@ export function readSubTail(
716
785
  recordSubagentEnd(db, {
717
786
  id: rowRef.id,
718
787
  endedAt: now,
719
- status: 'completed',
788
+ // Gap 2: keep the audit row honest — a worker that hit a
789
+ // terminal transcript error is `failed`, matching the
790
+ // handback outcome computed in maybySendStateTransition.
791
+ status: entry.errored ? 'failed' : 'completed',
720
792
  })
721
793
  }
722
794
  } catch (dbErr) {
@@ -778,6 +850,14 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
778
850
  config.silentStallTerminalMs
779
851
  ?? parseEnvMs('SWITCHROOM_SUBAGENT_STALL_TERMINAL_MS')
780
852
  ?? DEFAULT_SILENT_STALL_TERMINAL_MS
853
+ const inflightPromoteMaxAgeMs =
854
+ config.inflightPromoteMaxAgeMs
855
+ ?? parseEnvMs('SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS')
856
+ ?? DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS
857
+ // Kill-switch: not parseEnvMs (which rejects `0`) — an explicit `=0`
858
+ // here MUST disable promotion (revert to pre-v0.14.23 suppression).
859
+ const bootPromoteEnabled =
860
+ config.bootPromoteEnabled ?? (process.env.SWITCHROOM_SUBAGENT_BOOT_PROMOTE !== '0')
781
861
  const reaperTtlMs = config.reaperTtlMs ?? DEFAULT_REAPER_TTL_MS
782
862
  const reaperIntervalMs = config.reaperIntervalMs ?? DEFAULT_REAPER_INTERVAL_MS
783
863
  const rescanMs = config.rescanMs ?? DEFAULT_RESCAN_MS
@@ -917,6 +997,56 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
917
997
  log?.(`subagent-watcher: description updated for ${agentId}: ${desc}`)
918
998
  }, fs, log, db, parentStateDir, config.onUnstall, undefined, config.onProgress)
919
999
 
1000
+ // Gap 1 (restart survival): a file still RUNNING at boot is a LIVE
1001
+ // worker that predates this watcher — typically one dispatched in a
1002
+ // prior gateway life and still in-flight across a restart / fleet
1003
+ // rollout, NOT a stale already-finished file. `historical` must
1004
+ // suppress replay only for done-at-boot files; an in-flight-at-boot
1005
+ // worker the user is still waiting on must get full live treatment:
1006
+ // progress nudges, the stall-synthesis safety net (checkStalls skips
1007
+ // historical entries), and a real `completed`/`failed` handback rather
1008
+ // than a dropped `orphan`. Promote it to a live entry here. (A file
1009
+ // already `done` at boot stays historical and is short-circuited just
1010
+ // below — it finished before this session.)
1011
+ if (isHistorical && entry.state === 'running') {
1012
+ // Freshness gate (v0.14.24): only promote a file whose LAST WRITE is
1013
+ // recent. A genuinely in-flight-across-a-restart worker was writing
1014
+ // until the container was recreated (mtime seconds-to-minutes old); a
1015
+ // dead prior-session worker that never wrote a terminal turn_end is
1016
+ // also `running` but hours-to-weeks stale. Promoting the latter
1017
+ // replayed stale `failed` handbacks on every boot (the v0.14.23
1018
+ // fleet-wide regression). Unreadable mtime → treat as stale (suppress
1019
+ // rather than risk re-spamming). The kill-switch reverts to pre-fix
1020
+ // suppression entirely.
1021
+ let fileAgeMs = Infinity
1022
+ try {
1023
+ const st = fs.statSync(filePath)
1024
+ if (typeof st.mtimeMs === 'number') fileAgeMs = n - st.mtimeMs
1025
+ } catch {
1026
+ /* unreadable → Infinity → treated as stale below */
1027
+ }
1028
+ if (!bootPromoteEnabled) {
1029
+ log?.(`subagent-watcher: ${agentId} running at boot but promotion disabled (SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0) — leaving historical`)
1030
+ } else if (fileAgeMs > inflightPromoteMaxAgeMs) {
1031
+ log?.(`subagent-watcher: ${agentId} running at boot but stale (last write ${Math.round(fileAgeMs / 1000)}s ago > ${Math.round(inflightPromoteMaxAgeMs / 1000)}s) — leaving historical (dead prior-session worker, not in-flight)`)
1032
+ } else {
1033
+ entry.historical = false
1034
+ log?.(`subagent-watcher: ${agentId} was in-flight at boot — promoting to live (last write ${Math.round(fileAgeMs / 1000)}s ago; user still awaiting handback)`)
1035
+ // The prior gateway life's registration normally linked
1036
+ // jsonl_agent_id already, but re-run the backfill idempotently in
1037
+ // case that life crashed before the link persisted — the handback's
1038
+ // isBackground lookup is keyed on jsonl_agent_id, and an unlinked row
1039
+ // would mis-resolve the worker as foreground and drop the handback.
1040
+ if (db != null) {
1041
+ try {
1042
+ backfillJsonlAgentId(db, filePath, agentId, log)
1043
+ } catch (err) {
1044
+ log?.(`subagent-watcher: backfill error for ${agentId}: ${(err as Error).message}`)
1045
+ }
1046
+ }
1047
+ }
1048
+ }
1049
+
920
1050
  // If the JSONL already contained a turn_end at registration time
921
1051
  // (file written-then-watched), fire the state-transition + completion
922
1052
  // notification now. Otherwise the FSWatcher callback handles it on
@@ -980,11 +1110,22 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
980
1110
  config.onFinish({
981
1111
  agentId,
982
1112
  state: entry.state,
983
- outcome: entry.historical ? 'orphan' : 'completed',
1113
+ // Gap 2: a terminal error observed in the transcript wins over
1114
+ // the completed/orphan classification — a worker that crashed
1115
+ // is `failed`, even if it later wrote a turn_end or aged into
1116
+ // stall synthesis. `orphan` remains for genuinely stale
1117
+ // done-at-boot rows (which never reach this path; see
1118
+ // registerAgent's short-circuit + Gap 1 promotion).
1119
+ outcome: entry.errored ? 'failed' : entry.historical ? 'orphan' : 'completed',
984
1120
  toolCount: entry.toolCount,
985
1121
  durationMs: nowFn() - entry.dispatchedAt,
986
1122
  description: entry.description,
987
- resultText: entry.lastResultText,
1123
+ // For a failure, fall back to the error detail when the worker
1124
+ // left no narrative of its own — so the handback's "what it
1125
+ // reported before failing" slot is never empty on a crash.
1126
+ resultText: entry.errored
1127
+ ? entry.lastResultText || entry.errorDetail || ''
1128
+ : entry.lastResultText,
988
1129
  })
989
1130
  } catch (cbErr) {
990
1131
  log?.(`subagent-watcher: onFinish callback error ${agentId}: ${(cbErr as Error).message}`)
@@ -1151,7 +1292,10 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
1151
1292
  recordSubagentEnd(db, {
1152
1293
  id: rowRef.id,
1153
1294
  endedAt: n,
1154
- status: 'completed',
1295
+ // Gap 2: a worker that hit a terminal transcript error before
1296
+ // going silent is `failed`, not `completed` — keep the audit
1297
+ // row consistent with the handback outcome.
1298
+ status: entry.errored ? 'failed' : 'completed',
1155
1299
  })
1156
1300
  }
1157
1301
  } catch (dbErr) {
@@ -624,13 +624,17 @@ describe('Bug 3 — stalled-row sweeper: watcher must call recordSubagentStall i
624
624
  h.watcher.stop()
625
625
  })
626
626
 
627
- it('does not call stall for historical entries (pre-existing at boot)', () => {
627
+ it('does not call stall for historical (done-at-boot) entries', () => {
628
+ // A worker that already FINISHED before boot (turn_end present) stays
629
+ // historical and must not write stall rows. A still-RUNNING file at
630
+ // boot is a different case — Gap 1 promotes it to live so it DOES get
631
+ // the stall safety net (covered in subagent-watcher-handback-gaps).
628
632
  const agentDir = '/home/user/.switchroom/agents/myagent'
629
633
  const subagentsDir = `${agentDir}/.claude/projects/p1/session-abc/subagents`
630
634
  const jsonlStem = 'hist-agent'
631
635
  const toolUseId = 'toolu_hist001'
632
636
  const jsonlPath = `${subagentsDir}/agent-${jsonlStem}.jsonl`
633
- const content = buildJSONL(subAgentUserMsg('Old task'))
637
+ const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
634
638
 
635
639
  const db = makeInMemoryDb({
636
640
  [toolUseId]: { id: toolUseId, jsonl_agent_id: jsonlStem, status: 'running' },
@@ -648,7 +652,7 @@ describe('Bug 3 — stalled-row sweeper: watcher must call recordSubagentStall i
648
652
  db,
649
653
  })
650
654
 
651
- // Do NOT flip historical entry is historical by default (file at boot)
655
+ // Done-at-boot stays historical (not promoted); no stall write fires.
652
656
  h.advance(65_000)
653
657
 
654
658
  const stallDbCalls = db._calls.filter(
@@ -0,0 +1,380 @@
1
+ /**
2
+ * Tests for the two background-worker handback gaps closed in
3
+ * `fix/subagent-handback-restart-and-failure`:
4
+ *
5
+ * Gap 1 — restart survival. A background worker that is in-flight when
6
+ * the gateway restarts is discovered by the boot scan and tagged
7
+ * `historical`. That flag is meant to suppress replay for workers that
8
+ * ALREADY finished before boot — but it was also applied to workers
9
+ * still running, which then completed with outcome `orphan`, and the
10
+ * handback gate drops `orphan`. Net: dispatched worker + any gateway
11
+ * bounce (incl. a fleet rollout) + worker finishes = user never told.
12
+ * Fix: a file still `running` at boot is promoted to a LIVE entry, so
13
+ * it gets the stall-synthesis safety net and a real `completed`/`failed`
14
+ * handback. A file already `done` at boot stays suppressed.
15
+ *
16
+ * Gap 2 — failure honesty. The `failed` outcome was dead code (no caller
17
+ * set it), so every dead worker was reported `completed`. Fix: a
18
+ * TERMINAL error line in the worker's own transcript (model API failure
19
+ * / quota exhaustion / crash — not an in-flight retry, not a routine
20
+ * tool-level is_error) flips the terminal outcome to `failed` and
21
+ * carries the error detail into the handback result.
22
+ */
23
+
24
+ import { describe, it, expect, vi } from 'vitest'
25
+ import { startSubagentWatcher } from '../subagent-watcher.js'
26
+ import * as fs from 'fs'
27
+
28
+ function buildJSONL(...lines: object[]): string {
29
+ return lines.map((l) => JSON.stringify(l)).join('\n') + '\n'
30
+ }
31
+ function subAgentUserMsg(promptText: string) {
32
+ return { type: 'user', message: { content: [{ type: 'text', text: promptText }] } }
33
+ }
34
+ function subAgentText(text: string) {
35
+ return { type: 'assistant', message: { content: [{ type: 'text', text }] } }
36
+ }
37
+ function subAgentTurnEnd() {
38
+ return { type: 'system', subtype: 'turn_duration', duration_ms: 1234 }
39
+ }
40
+ // A terminal error line in the worker's OWN transcript — the model call
41
+ // itself failed (here an invalid_request_error). `detectErrorInTranscriptLine`
42
+ // classifies an explicit `type:"error"` line with a non-rate-limit kind as
43
+ // terminal:true.
44
+ function subAgentTerminalError(message: string) {
45
+ return { type: 'error', error: { type: 'invalid_request_error', message } }
46
+ }
47
+ // A routine mid-run tool failure (e.g. a grep that found nothing). This is a
48
+ // `sub_agent_tool_result` with is_error — NOT a worker death. Must NOT trip
49
+ // the failed classification.
50
+ function subAgentToolResultError() {
51
+ return {
52
+ type: 'user',
53
+ message: {
54
+ content: [{ type: 'tool_result', tool_use_id: 'toolu_x', is_error: true, content: 'no matches found' }],
55
+ },
56
+ }
57
+ }
58
+
59
+ interface FinishCall {
60
+ agentId: string
61
+ outcome: string
62
+ resultText: string
63
+ }
64
+
65
+ interface Harness {
66
+ stallTerminalCalls: Array<{ agentId: string }>
67
+ finishCalls: FinishCall[]
68
+ logs: string[]
69
+ advance: (ms: number) => void
70
+ watcher: ReturnType<typeof startSubagentWatcher>
71
+ fileContents: Map<string, Buffer>
72
+ jsonlPath: string
73
+ append: (...lines: object[]) => void
74
+ }
75
+
76
+ function makeHarness(opts: {
77
+ agentId?: string
78
+ /** Lines present in the JSONL at boot (before the watcher starts). */
79
+ bootLines: object[]
80
+ stallThresholdMs?: number
81
+ silentStallTerminalMs?: number
82
+ rescanMs?: number
83
+ /** How long ago (ms) the boot file was last written, i.e. its mtime is
84
+ * `currentTime - bootFileAgeMs` at registration. Default 0 (fresh, so the
85
+ * freshness gate promotes it). Set large to simulate a dead prior-session
86
+ * worker that must NOT be promoted. */
87
+ bootFileAgeMs?: number
88
+ /** Kill-switch passthrough; default true (promotion enabled). */
89
+ bootPromoteEnabled?: boolean
90
+ inflightPromoteMaxAgeMs?: number
91
+ }): Harness {
92
+ const {
93
+ agentId = 'gap-agent',
94
+ bootLines,
95
+ stallThresholdMs = 60_000,
96
+ silentStallTerminalMs = 300_000,
97
+ rescanMs = 500,
98
+ bootFileAgeMs = 0,
99
+ bootPromoteEnabled = true,
100
+ inflightPromoteMaxAgeMs,
101
+ } = opts
102
+
103
+ let currentTime = 1000
104
+ const stallTerminalCalls: Array<{ agentId: string }> = []
105
+ const finishCalls: FinishCall[] = []
106
+ const logs: string[] = []
107
+
108
+ const agentDir = '/home/user/.switchroom/agents/myagent'
109
+ const sessionId = 'mock-session'
110
+ const projectsRoot = `${agentDir}/.claude/projects`
111
+ const projectDir = `${projectsRoot}/mock-cwd`
112
+ const sessionDir = `${projectDir}/${sessionId}`
113
+ const subagentsDir = `${sessionDir}/subagents`
114
+ const jsonlPath = `${subagentsDir}/agent-${agentId}.jsonl`
115
+
116
+ const fileContents = new Map<string, Buffer>()
117
+ fileContents.set(jsonlPath, Buffer.from(buildJSONL(...bootLines), 'utf-8'))
118
+ // Per-file mtime (ms). The boot file's last write is `bootFileAgeMs` in the
119
+ // past; appends bump it to currentTime. The freshness gate reads this.
120
+ const fileMtimes = new Map<string, number>()
121
+ fileMtimes.set(jsonlPath, 1000 - bootFileAgeMs)
122
+
123
+ let lastOpenedPath: string | null = null
124
+ const mockFs = {
125
+ existsSync: ((p: fs.PathLike) => {
126
+ const ps = String(p)
127
+ if (ps === projectsRoot || ps === projectDir || ps === sessionDir || ps === subagentsDir) return true
128
+ if (fileContents.has(ps)) return true
129
+ return false
130
+ }) as typeof fs.existsSync,
131
+ readdirSync: ((p: fs.PathLike) => {
132
+ const ps = String(p)
133
+ if (ps === projectsRoot) return ['mock-cwd']
134
+ if (ps === projectDir) return [sessionId]
135
+ if (ps === sessionDir) return ['subagents']
136
+ if (ps === subagentsDir) return [`agent-${agentId}.jsonl`]
137
+ return []
138
+ }) as unknown as typeof fs.readdirSync,
139
+ statSync: ((p: fs.PathLike) => ({ size: fileContents.get(String(p))?.length ?? 0, mtimeMs: fileMtimes.get(String(p)) ?? currentTime }) as fs.Stats) as typeof fs.statSync,
140
+ openSync: ((p: fs.PathLike) => {
141
+ lastOpenedPath = String(p)
142
+ return 42
143
+ }) as unknown as typeof fs.openSync,
144
+ closeSync: (() => { lastOpenedPath = null }) as typeof fs.closeSync,
145
+ readSync: ((
146
+ _fd: number,
147
+ buf: NodeJS.ArrayBufferView,
148
+ offset: number,
149
+ length: number,
150
+ position: number | null,
151
+ ): number => {
152
+ const content = lastOpenedPath != null ? fileContents.get(lastOpenedPath) : undefined
153
+ if (!content) return 0
154
+ const pos = position ?? 0
155
+ const src = content.slice(pos, pos + length)
156
+ ;(src as Buffer).copy(buf as Buffer, offset)
157
+ return src.length
158
+ }) as unknown as typeof fs.readSync,
159
+ watch: (() => ({ close: vi.fn() }) as unknown as fs.FSWatcher) as unknown as typeof fs.watch,
160
+ }
161
+
162
+ const intervals: Array<{ fn: () => void; ms: number; ref: number; fireAt: number }> = []
163
+ let nextRef = 1
164
+
165
+ const watcher = startSubagentWatcher({
166
+ agentDir,
167
+ stallThresholdMs,
168
+ silentSynthesisStallThresholdMs: stallThresholdMs,
169
+ silentStallTerminalMs,
170
+ rescanMs,
171
+ bootPromoteEnabled,
172
+ ...(inflightPromoteMaxAgeMs != null ? { inflightPromoteMaxAgeMs } : {}),
173
+ onStallTerminal: (id) => stallTerminalCalls.push({ agentId: id }),
174
+ onFinish: ({ agentId: id, outcome, resultText }) =>
175
+ finishCalls.push({ agentId: id, outcome, resultText }),
176
+ now: () => currentTime,
177
+ setInterval: (fn, ms) => {
178
+ const ref = nextRef++
179
+ intervals.push({ fn, ms, ref, fireAt: currentTime + ms })
180
+ return { ref }
181
+ },
182
+ clearInterval: (handle) => {
183
+ const { ref } = handle as { ref: number }
184
+ const idx = intervals.findIndex((i) => i.ref === ref)
185
+ if (idx !== -1) intervals.splice(idx, 1)
186
+ },
187
+ fs: mockFs,
188
+ log: (msg) => logs.push(msg),
189
+ })
190
+
191
+ const advance = (ms: number): void => {
192
+ currentTime += ms
193
+ for (;;) {
194
+ intervals.sort((a, b) => a.fireAt - b.fireAt)
195
+ const next = intervals[0]
196
+ if (!next || next.fireAt > currentTime) break
197
+ next.fireAt += next.ms
198
+ next.fn()
199
+ }
200
+ }
201
+
202
+ const append = (...lines: object[]): void => {
203
+ const cur = fileContents.get(jsonlPath) ?? Buffer.alloc(0)
204
+ const more = buildJSONL(...lines)
205
+ fileContents.set(jsonlPath, Buffer.concat([cur, Buffer.from(more, 'utf-8')]))
206
+ fileMtimes.set(jsonlPath, currentTime)
207
+ }
208
+
209
+ return { stallTerminalCalls, finishCalls, logs, advance, watcher, fileContents, jsonlPath, append }
210
+ }
211
+
212
+ describe('Gap 1 — background worker in-flight across a gateway restart', () => {
213
+ it('an in-flight-at-boot worker that completes hands back as completed (not orphan)', () => {
214
+ // Boot scan finds a running worker (prompt, no turn_end yet) → tagged
215
+ // historical. The fix promotes it to live. When it finishes under our
216
+ // watch, the outcome must be `completed` so the handback delivers.
217
+ const h = makeHarness({ agentId: 'gap1-complete', bootLines: [subAgentUserMsg('bg task')] })
218
+
219
+ // The worker finishes after the restart.
220
+ h.append(subAgentText('Found the root cause in auth.ts'), subAgentTurnEnd())
221
+ h.advance(600) // one poll reads the new bytes
222
+
223
+ expect(h.finishCalls).toHaveLength(1)
224
+ expect(h.finishCalls[0].agentId).toBe('gap1-complete')
225
+ expect(h.finishCalls[0].outcome).toBe('completed') // pre-fix: 'orphan' → dropped
226
+ expect(h.finishCalls[0].resultText).toContain('root cause')
227
+ // The promotion is logged so the path is observable in prod.
228
+ expect(h.logs.some((l) => l.includes('in-flight at boot — promoting to live'))).toBe(true)
229
+ })
230
+
231
+ it('an in-flight-at-boot worker that dies silently is rescued by stall synthesis', () => {
232
+ // Pre-fix, historical entries were skipped by stall detection, so a
233
+ // worker that crossed a restart and then went silent sat running
234
+ // forever — no handback ever. After promotion it gets the safety net.
235
+ const h = makeHarness({
236
+ agentId: 'gap1-silent',
237
+ bootLines: [subAgentUserMsg('bg task')],
238
+ stallThresholdMs: 60_000,
239
+ silentStallTerminalMs: 120_000,
240
+ })
241
+
242
+ h.advance(62_000) // stall threshold crossed
243
+ expect(h.stallTerminalCalls).toHaveLength(0)
244
+ h.advance(121_000) // silent-stall terminal window elapses → synthesis
245
+ expect(h.stallTerminalCalls).toHaveLength(1)
246
+ expect(h.finishCalls).toHaveLength(1)
247
+ expect(h.finishCalls[0].outcome).toBe('completed')
248
+ })
249
+
250
+ it('a worker already DONE at boot stays suppressed (no spurious replay)', () => {
251
+ // The legitimate use of `historical`: a worker that finished in a prior
252
+ // session must NOT re-fire a handback on every restart. This is the
253
+ // regression guard for the fix.
254
+ const h = makeHarness({
255
+ agentId: 'gap1-stale',
256
+ bootLines: [subAgentUserMsg('bg task'), subAgentText('done long ago'), subAgentTurnEnd()],
257
+ })
258
+
259
+ h.advance(600)
260
+ h.advance(600_000) // well past any stall window
261
+ expect(h.finishCalls).toHaveLength(0)
262
+ expect(h.stallTerminalCalls).toHaveLength(0)
263
+ })
264
+ })
265
+
266
+ describe('Gap 1 freshness gate — v0.14.24 stale-replay regression', () => {
267
+ // The v0.14.23 regression: promoting EVERY running-at-boot file replayed
268
+ // weeks-old dead prior-session workers as handbacks (often `failed`, from
269
+ // old error lines) on every boot, spamming the whole fleet. The gate
270
+ // promotes only files whose last write is recent.
271
+
272
+ it('a STALE running-at-boot worker (weeks-old mtime) is NOT promoted — no handback, no stall', () => {
273
+ const h = makeHarness({
274
+ agentId: 'gap1-stale-running',
275
+ bootLines: [subAgentUserMsg('bg task from weeks ago')], // running: no turn_end
276
+ bootFileAgeMs: 21 * 24 * 60 * 60_000, // 21 days old — clearly dead
277
+ silentStallTerminalMs: 120_000,
278
+ })
279
+
280
+ h.advance(600)
281
+ h.advance(600_000) // far past every stall/synthesis window
282
+ expect(h.finishCalls).toHaveLength(0) // pre-fix: a spurious (often failed) handback
283
+ expect(h.stallTerminalCalls).toHaveLength(0)
284
+ expect(h.logs.some((l) => l.includes('stale') && l.includes('leaving historical'))).toBe(true)
285
+ })
286
+
287
+ it('a FRESH running-at-boot worker (recent mtime) IS still promoted and hands back', () => {
288
+ // Preserve the genuine Gap 1 fix: a worker in-flight across a restart
289
+ // (wrote moments before the bounce) must still get promoted + handed back.
290
+ const h = makeHarness({
291
+ agentId: 'gap1-fresh-running',
292
+ bootLines: [subAgentUserMsg('bg task')],
293
+ bootFileAgeMs: 30_000, // 30s old — in-flight across a quick restart
294
+ })
295
+
296
+ h.append(subAgentText('Finished the migration'), subAgentTurnEnd())
297
+ h.advance(600)
298
+
299
+ expect(h.finishCalls).toHaveLength(1)
300
+ expect(h.finishCalls[0].outcome).toBe('completed')
301
+ expect(h.logs.some((l) => l.includes('promoting to live'))).toBe(true)
302
+ })
303
+
304
+ it('kill-switch (bootPromoteEnabled=false) suppresses even a fresh running-at-boot worker', () => {
305
+ const h = makeHarness({
306
+ agentId: 'gap1-killswitch',
307
+ bootLines: [subAgentUserMsg('bg task')],
308
+ bootFileAgeMs: 5_000, // fresh — would normally promote
309
+ bootPromoteEnabled: false,
310
+ silentStallTerminalMs: 120_000,
311
+ })
312
+
313
+ h.advance(600)
314
+ h.advance(600_000)
315
+ expect(h.finishCalls).toHaveLength(0)
316
+ expect(h.logs.some((l) => l.includes('promotion disabled'))).toBe(true)
317
+ })
318
+
319
+ it('a worker just past the freshness window is NOT promoted (boundary)', () => {
320
+ const h = makeHarness({
321
+ agentId: 'gap1-boundary',
322
+ bootLines: [subAgentUserMsg('bg task')],
323
+ inflightPromoteMaxAgeMs: 60_000, // 60s window
324
+ bootFileAgeMs: 90_000, // 90s old → just stale
325
+ silentStallTerminalMs: 120_000,
326
+ })
327
+
328
+ h.advance(600)
329
+ h.advance(600_000)
330
+ expect(h.finishCalls).toHaveLength(0)
331
+ expect(h.logs.some((l) => l.includes('stale'))).toBe(true)
332
+ })
333
+ })
334
+
335
+ describe('Gap 2 — failure honesty', () => {
336
+ it('a terminal error line flips the outcome to failed and carries the detail', () => {
337
+ const h = makeHarness({ agentId: 'gap2-failed', bootLines: [subAgentUserMsg('bg task')] })
338
+
339
+ // The worker's model call errors out, then the transcript ends.
340
+ h.append(subAgentTerminalError('tool input rejected by the API'), subAgentTurnEnd())
341
+ h.advance(600)
342
+
343
+ expect(h.finishCalls).toHaveLength(1)
344
+ expect(h.finishCalls[0].outcome).toBe('failed')
345
+ // No narrative was emitted, so the detail backfills the result slot.
346
+ expect(h.finishCalls[0].resultText).toContain('tool input rejected')
347
+ })
348
+
349
+ it('a failed worker that went silent still synthesises terminal as failed', () => {
350
+ const h = makeHarness({
351
+ agentId: 'gap2-failed-silent',
352
+ bootLines: [subAgentUserMsg('bg task')],
353
+ stallThresholdMs: 60_000,
354
+ silentStallTerminalMs: 120_000,
355
+ })
356
+
357
+ // Error line, then the worker goes silent (no turn_end).
358
+ h.append(subAgentTerminalError('worker process crashed'))
359
+ h.advance(600) // read the error line
360
+ h.advance(62_000) // stall
361
+ h.advance(121_000) // synthesis
362
+ expect(h.stallTerminalCalls).toHaveLength(1)
363
+ expect(h.finishCalls).toHaveLength(1)
364
+ expect(h.finishCalls[0].outcome).toBe('failed')
365
+ expect(h.finishCalls[0].resultText).toContain('crashed')
366
+ })
367
+
368
+ it('a routine mid-run tool error does NOT cause a false failure', () => {
369
+ const h = makeHarness({ agentId: 'gap2-toolerr', bootLines: [subAgentUserMsg('bg task')] })
370
+
371
+ // A tool_result with is_error (e.g. grep found nothing) mid-run, then
372
+ // the worker recovers and completes normally.
373
+ h.append(subAgentToolResultError(), subAgentText('Completed after a retry'), subAgentTurnEnd())
374
+ h.advance(600)
375
+
376
+ expect(h.finishCalls).toHaveLength(1)
377
+ expect(h.finishCalls[0].outcome).toBe('completed') // NOT failed
378
+ expect(h.finishCalls[0].resultText).toContain('Completed after a retry')
379
+ })
380
+ })
@@ -693,18 +693,21 @@ describe('startSubagentWatcher', () => {
693
693
  h.watcher.stop()
694
694
  })
695
695
 
696
- it('suppresses stall notifications for historical entries', () => {
697
- // Historical entries (file existed at watcher boot) must NOT fire
698
- // stall notifications. The sub-agent process is long dead; the file
699
- // is just left over from a prior session. With many historicals
700
- // present at restart, firing stalls for each would flood the chat.
696
+ it('suppresses stall notifications for historical (done-at-boot) entries', () => {
697
+ // A worker that already FINISHED before the watcher booted (turn_end
698
+ // present in the file) stays historical and must NOT fire stall
699
+ // notifications. With months of finished session history present at
700
+ // restart, firing stalls for each would flood the chat. NOTE: a worker
701
+ // still RUNNING at boot is a different case — Gap 1 promotes it to live
702
+ // so it DOES get the stall safety net (it's an in-flight worker the
703
+ // user is still awaiting), covered in subagent-watcher-handback-gaps.
701
704
  const agentDir = '/home/user/.switchroom/agents/myagent'
702
705
  const projectsRoot = `${agentDir}/.claude/projects`
703
706
  const projectDir = `${projectsRoot}/myproject`
704
707
  const sessionDir = `${projectDir}/session-abc123`
705
708
  const subagentsDir = `${sessionDir}/subagents`
706
709
  const jsonlPath = `${subagentsDir}/agent-deadbeef.jsonl`
707
- const content = buildJSONL(subAgentUserMsg('Old task'))
710
+ const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
708
711
 
709
712
  const h = makeHarness({
710
713
  agentDir,
@@ -809,12 +812,15 @@ describe('startSubagentWatcher', () => {
809
812
 
810
813
  describe('historical-vs-active filter', () => {
811
814
  /**
812
- * Pre-existing JSONL files at watcher boot are tagged historical=true.
813
- * Stalls and completion notifications are gated on !historical so a
814
- * restart with months of session history doesn't flood the chat.
815
+ * Pre-existing FINISHED (done-at-boot) JSONL files are tagged
816
+ * historical=true. Stalls and completion notifications are gated on
817
+ * !historical so a restart with months of session history doesn't
818
+ * flood the chat. (A still-RUNNING file at boot is promoted to live by
819
+ * Gap 1 — see subagent-watcher-handback-gaps — so it must carry a
820
+ * turn_end here to stay historical.)
815
821
  */
816
822
 
817
- it('pre-existing JSONL files at startup are tagged historical', () => {
823
+ it('pre-existing done-at-boot JSONL files are tagged historical', () => {
818
824
  const agentDir = '/home/user/.switchroom/agents/myagent'
819
825
  const projectsRoot = `${agentDir}/.claude/projects`
820
826
  const projectDir = `${projectsRoot}/myproject`
@@ -823,7 +829,7 @@ describe('startSubagentWatcher', () => {
823
829
  const jsonlA = `${subagentsDir}/agent-hist-aaaa.jsonl`
824
830
  const jsonlB = `${subagentsDir}/agent-hist-bbbb.jsonl`
825
831
 
826
- const content = buildJSONL(subAgentUserMsg('Old task'))
832
+ const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
827
833
 
828
834
  const h = makeHarness({
829
835
  agentDir,
@@ -895,10 +901,12 @@ describe('startSubagentWatcher', () => {
895
901
  })
896
902
 
897
903
  it('pre-existing in-flight agent that finishes after restart fires completion', () => {
898
- // Historical at boot. Then writes turn_end. Completion notification
899
- // still fires for the state transition (the file was in-flight at
900
- // boot, so the transition is meaningful even if the entry is tagged
901
- // historical for stall-suppression purposes).
904
+ // Running at boot → Gap 1 promotes it to live (historical=false),
905
+ // because it's an in-flight worker the user is still awaiting across
906
+ // the restart. When it then writes turn_end, the completion
907
+ // notification fires for the state transition. (The deeper handback
908
+ // outcome — completed, not the dropped `orphan` — is covered in
909
+ // subagent-watcher-handback-gaps.)
902
910
  const agentDir = '/home/user/.switchroom/agents/myagent'
903
911
  const projectsRoot = `${agentDir}/.claude/projects`
904
912
  const projectDir = `${projectsRoot}/myproject`