switchroom 0.14.22 → 0.14.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/switchroom.js +2 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +167 -8
- package/telegram-plugin/subagent-watcher.ts +149 -5
- package/telegram-plugin/tests/subagent-registry-bugs.test.ts +7 -3
- package/telegram-plugin/tests/subagent-watcher-handback-gaps.test.ts +380 -0
- package/telegram-plugin/tests/subagent-watcher.test.ts +23 -15
package/dist/cli/switchroom.js
CHANGED
|
@@ -49420,8 +49420,8 @@ var {
|
|
|
49420
49420
|
} = import__.default;
|
|
49421
49421
|
|
|
49422
49422
|
// src/build-info.ts
|
|
49423
|
-
var VERSION = "0.14.22";
|
|
49424
|
-
var COMMIT_SHA = "
|
|
49423
|
+
var VERSION = "0.14.24";
|
|
49424
|
+
var COMMIT_SHA = "2711d052";
|
|
49425
49425
|
|
|
49426
49426
|
// src/cli/agent.ts
|
|
49427
49427
|
init_source();
|
package/package.json
CHANGED
package/telegram-plugin/dist/gateway/gateway.js
CHANGED
|
@@ -48827,6 +48827,67 @@ import {
|
|
|
48827
48827
|
import { join as join21 } from "path";
|
|
48828
48828
|
|
|
48829
48829
|
// operator-events.ts
|
|
48830
|
+
function classifyClaudeError(raw) {
|
|
48831
|
+
try {
|
|
48832
|
+
return classifyInner(raw);
|
|
48833
|
+
} catch {
|
|
48834
|
+
return "unknown-4xx";
|
|
48835
|
+
}
|
|
48836
|
+
}
|
|
48837
|
+
function classifyInner(raw) {
|
|
48838
|
+
if (raw == null)
|
|
48839
|
+
return "unknown-4xx";
|
|
48840
|
+
const obj = typeof raw === "object" ? raw : {};
|
|
48841
|
+
const errorType = extractString(obj, "error_type") ?? extractString(obj, "type") ?? extractString(getNestedObj(obj, "error"), "type") ?? "";
|
|
48842
|
+
const errorCode = extractString(obj, "code") ?? extractString(getNestedObj(obj, "error"), "code") ?? "";
|
|
48843
|
+
const message = extractString(obj, "message") ?? extractString(getNestedObj(obj, "error"), "message") ?? (typeof raw === "string" ? raw : "") ?? "";
|
|
48844
|
+
const status = extractNumber(obj, "status") ?? extractNumber(obj, "statusCode") ?? extractNumber(obj, "status_code") ?? null;
|
|
48845
|
+
const sdkCode = extractString(obj, "error_code") ?? "";
|
|
48846
|
+
if (errorType === "authentication_error" || errorCode === "authentication_error" || sdkCode === "authentication_error" || message.toLowerCase().includes("authentication_error")) {
|
|
48847
|
+
const msg = message.toLowerCase();
|
|
48848
|
+
if (msg.includes("expired") || msg.includes("refresh")) {
|
|
48849
|
+
return "credentials-expired";
|
|
48850
|
+
}
|
|
48851
|
+
return "credentials-invalid";
|
|
48852
|
+
}
|
|
48853
|
+
if (errorType === "invalid_api_key" || errorCode === "invalid_api_key" || sdkCode === "invalid_api_key" || message.toLowerCase().includes("invalid_api_key") || message.toLowerCase().includes("invalid api key")) {
|
|
48854
|
+
return "credentials-invalid";
|
|
48855
|
+
}
|
|
48856
|
+
if (errorType === "credit_balance_too_low" || errorCode === "credit_balance_too_low" || sdkCode === "credit_balance_too_low" || message.toLowerCase().includes("credit_balance_too_low") || message.toLowerCase().includes("credit balance")) {
|
|
48857
|
+
return "credit-exhausted";
|
|
48858
|
+
}
|
|
48859
|
+
if (errorType === "rate_limit_error" || errorCode === "rate_limit_error" || sdkCode === "rate_limit_error" || message.toLowerCase().includes("rate_limit_error") || message.toLowerCase().includes("rate limit")) {
|
|
48860
|
+
return "rate-limited";
|
|
48861
|
+
}
|
|
48862
|
+
if (errorType === "overloaded_error" || errorCode === "overloaded_error" || sdkCode === "overloaded_error" || message.toLowerCase().includes("overloaded_error") || message.toLowerCase().includes("overloaded")) {
|
|
48863
|
+
return "rate-limited";
|
|
48864
|
+
}
|
|
48865
|
+
if (errorType === "agent-crashed" || errorCode === "agent-crashed") {
|
|
48866
|
+
return "agent-crashed";
|
|
48867
|
+
}
|
|
48868
|
+
if (errorType === "agent-restarted-unexpectedly" || errorCode === "agent-restarted-unexpectedly") {
|
|
48869
|
+
return "agent-restarted-unexpectedly";
|
|
48870
|
+
}
|
|
48871
|
+
if (status != null) {
|
|
48872
|
+
if (status >= 400 && status < 500)
|
|
48873
|
+
return "unknown-4xx";
|
|
48874
|
+
if (status >= 500 && status < 600)
|
|
48875
|
+
return "unknown-5xx";
|
|
48876
|
+
}
|
|
48877
|
+
return "unknown-4xx";
|
|
48878
|
+
}
|
|
48879
|
+
function extractString(obj, key) {
|
|
48880
|
+
const v = obj[key];
|
|
48881
|
+
return typeof v === "string" && v.length > 0 ? v : null;
|
|
48882
|
+
}
|
|
48883
|
+
function extractNumber(obj, key) {
|
|
48884
|
+
const v = obj[key];
|
|
48885
|
+
return typeof v === "number" ? v : null;
|
|
48886
|
+
}
|
|
48887
|
+
function getNestedObj(obj, key) {
|
|
48888
|
+
const v = obj[key];
|
|
48889
|
+
return typeof v === "object" && v != null ? v : {};
|
|
48890
|
+
}
|
|
48830
48891
|
var DEFAULT_OPERATOR_EVENT_COOLDOWN_MS2 = 5 * 60000;
|
|
48831
48892
|
var cooldownMap2 = new Map;
|
|
48832
48893
|
|
|
@@ -48936,6 +48997,72 @@ function projectSubagentLine(line, agentId, state4) {
|
|
|
48936
48997
|
}
|
|
48937
48998
|
return [];
|
|
48938
48999
|
}
|
|
49000
|
+
function extractRetryState(obj) {
|
|
49001
|
+
return {
|
|
49002
|
+
retryAttempt: typeof obj.retryAttempt === "number" ? obj.retryAttempt : null,
|
|
49003
|
+
maxRetries: typeof obj.maxRetries === "number" ? obj.maxRetries : null
|
|
49004
|
+
};
|
|
49005
|
+
}
|
|
49006
|
+
function detectErrorInTranscriptLine(line) {
|
|
49007
|
+
if (!line || line.length > 2 * 1024 * 1024)
|
|
49008
|
+
return null;
|
|
49009
|
+
let obj;
|
|
49010
|
+
try {
|
|
49011
|
+
obj = JSON.parse(line);
|
|
49012
|
+
} catch {
|
|
49013
|
+
return null;
|
|
49014
|
+
}
|
|
49015
|
+
if (typeof obj !== "object" || obj == null)
|
|
49016
|
+
return null;
|
|
49017
|
+
const type = obj.type;
|
|
49018
|
+
if (obj.isApiErrorMessage === true) {
|
|
49019
|
+
const status = typeof obj.apiErrorStatus === "number" ? obj.apiErrorStatus : null;
|
|
49020
|
+
const errStr = typeof obj.error === "string" ? obj.error : "";
|
|
49021
|
+
const text = extractAssistantText(obj);
|
|
49022
|
+
const kind2 = status === 429 ? "quota-exhausted" : classifyClaudeError({ type: errStr, status, message: text });
|
|
49023
|
+
return {
|
|
49024
|
+
kind: kind2,
|
|
49025
|
+
raw: obj,
|
|
49026
|
+
detail: text || errStr || "api error",
|
|
49027
|
+
transient: kind2 === "rate-limited",
|
|
49028
|
+
terminal: true
|
|
49029
|
+
};
|
|
49030
|
+
}
|
|
49031
|
+
const isErrorLine = type === "api_error" || type === "error";
|
|
49032
|
+
const embeddedError = typeof obj.error === "object" && obj.error != null ? obj.error : null;
|
|
49033
|
+
if (!isErrorLine && !embeddedError)
|
|
49034
|
+
return null;
|
|
49035
|
+
const raw = embeddedError ?? obj;
|
|
49036
|
+
const kind = classifyClaudeError(embeddedError ?? obj);
|
|
49037
|
+
const detail = extractDetailMessage(embeddedError) ?? extractDetailMessage(obj) ?? String(type ?? "");
|
|
49038
|
+
const transient = kind === "rate-limited";
|
|
49039
|
+
const retry = extractRetryState(obj);
|
|
49040
|
+
const terminal = !transient ? true : retry.retryAttempt != null && retry.maxRetries != null ? retry.retryAttempt >= retry.maxRetries : isErrorLine;
|
|
49041
|
+
return { kind, raw, detail, transient, terminal };
|
|
49042
|
+
}
|
|
49043
|
+
function extractDetailMessage(obj) {
|
|
49044
|
+
if (!obj)
|
|
49045
|
+
return null;
|
|
49046
|
+
const msg = obj.message;
|
|
49047
|
+
return typeof msg === "string" && msg.length > 0 ? msg : null;
|
|
49048
|
+
}
|
|
49049
|
+
function extractAssistantText(obj) {
|
|
49050
|
+
const message = obj.message;
|
|
49051
|
+
if (typeof message !== "object" || message == null)
|
|
49052
|
+
return "";
|
|
49053
|
+
const content = message.content;
|
|
49054
|
+
if (!Array.isArray(content))
|
|
49055
|
+
return "";
|
|
49056
|
+
const parts = [];
|
|
49057
|
+
for (const block of content) {
|
|
49058
|
+
if (typeof block === "object" && block != null && block.type === "text") {
|
|
49059
|
+
const t = block.text;
|
|
49060
|
+
if (typeof t === "string")
|
|
49061
|
+
parts.push(t);
|
|
49062
|
+
}
|
|
49063
|
+
}
|
|
49064
|
+
return parts.join(" ").trim();
|
|
49065
|
+
}
|
|
48939
49066
|
|
|
48940
49067
|
// fleet-state.ts
|
|
48941
49068
|
var SANITISE_MAX_LEN = 120;
|
|
@@ -49098,6 +49225,7 @@ var DEFAULT_RESCAN_MS = 1000;
|
|
|
49098
49225
|
var DEFAULT_STALL_THRESHOLD_MS = 60000;
|
|
49099
49226
|
var DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS = 300000;
|
|
49100
49227
|
var DEFAULT_SILENT_STALL_TERMINAL_MS = 300000;
|
|
49228
|
+
var DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS = 15 * 60000;
|
|
49101
49229
|
var SUBAGENT_RESULT_TEXT_MAX = 3000;
|
|
49102
49230
|
function parseEnvMs(varName) {
|
|
49103
49231
|
const raw = process.env[varName];
|
|
@@ -49189,6 +49317,12 @@ function readSubTail(entry, tail, now, onDescriptionUpdate, fs2, log, db2, paren
|
|
|
49189
49317
|
for (const line of lines) {
|
|
49190
49318
|
if (!line)
|
|
49191
49319
|
continue;
|
|
49320
|
+
const errInfo = detectErrorInTranscriptLine(line);
|
|
49321
|
+
if (errInfo?.terminal) {
|
|
49322
|
+
entry.errored = true;
|
|
49323
|
+
if (errInfo.detail)
|
|
49324
|
+
entry.errorDetail = errInfo.detail.slice(0, SUBAGENT_RESULT_TEXT_MAX);
|
|
49325
|
+
}
|
|
49192
49326
|
const events = projectSubagentLine(line, entry.agentId, startState);
|
|
49193
49327
|
for (const ev of events) {
|
|
49194
49328
|
const idleSecBeforeBump = Math.round((now - entry.lastActivityAt) / 1000);
|
|
@@ -49253,7 +49387,7 @@ function readSubTail(entry, tail, now, onDescriptionUpdate, fs2, log, db2, paren
|
|
|
49253
49387
|
recordSubagentEnd(db2, {
|
|
49254
49388
|
id: rowRef.id,
|
|
49255
49389
|
endedAt: now,
|
|
49256
|
-
status: "completed"
|
|
49390
|
+
status: entry.errored ? "failed" : "completed"
|
|
49257
49391
|
});
|
|
49258
49392
|
}
|
|
49259
49393
|
} catch (dbErr) {
|
|
@@ -49282,6 +49416,8 @@ function startSubagentWatcher(config) {
|
|
|
49282
49416
|
const stallThresholdMs = config.stallThresholdMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_STALL_MS") ?? DEFAULT_STALL_THRESHOLD_MS;
|
|
49283
49417
|
const silentSynthesisStallThresholdMs = config.silentSynthesisStallThresholdMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_SILENT_SYNTH_STALL_MS") ?? DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS;
|
|
49284
49418
|
const silentStallTerminalMs = config.silentStallTerminalMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_STALL_TERMINAL_MS") ?? DEFAULT_SILENT_STALL_TERMINAL_MS;
|
|
49419
|
+
const inflightPromoteMaxAgeMs = config.inflightPromoteMaxAgeMs ?? parseEnvMs("SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS") ?? DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS;
|
|
49420
|
+
const bootPromoteEnabled = config.bootPromoteEnabled ?? process.env.SWITCHROOM_SUBAGENT_BOOT_PROMOTE !== "0";
|
|
49285
49421
|
const reaperTtlMs = config.reaperTtlMs ?? DEFAULT_REAPER_TTL_MS;
|
|
49286
49422
|
const reaperIntervalMs = config.reaperIntervalMs ?? DEFAULT_REAPER_INTERVAL_MS;
|
|
49287
49423
|
const rescanMs = config.rescanMs ?? DEFAULT_RESCAN_MS;
|
|
@@ -49363,6 +49499,29 @@ function startSubagentWatcher(config) {
|
|
|
49363
49499
|
readSubTail(entry, tail, n, (desc) => {
|
|
49364
49500
|
log?.(`subagent-watcher: description updated for ${agentId}: ${desc}`);
|
|
49365
49501
|
}, fs2, log, db2, parentStateDir, config.onUnstall, undefined, config.onProgress);
|
|
49502
|
+
if (isHistorical && entry.state === "running") {
|
|
49503
|
+
let fileAgeMs = Infinity;
|
|
49504
|
+
try {
|
|
49505
|
+
const st = fs2.statSync(filePath);
|
|
49506
|
+
if (typeof st.mtimeMs === "number")
|
|
49507
|
+
fileAgeMs = n - st.mtimeMs;
|
|
49508
|
+
} catch {}
|
|
49509
|
+
if (!bootPromoteEnabled) {
|
|
49510
|
+
log?.(`subagent-watcher: ${agentId} running at boot but promotion disabled (SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0) \u2014 leaving historical`);
|
|
49511
|
+
} else if (fileAgeMs > inflightPromoteMaxAgeMs) {
|
|
49512
|
+
log?.(`subagent-watcher: ${agentId} running at boot but stale (last write ${Math.round(fileAgeMs / 1000)}s ago > ${Math.round(inflightPromoteMaxAgeMs / 1000)}s) \u2014 leaving historical (dead prior-session worker, not in-flight)`);
|
|
49513
|
+
} else {
|
|
49514
|
+
entry.historical = false;
|
|
49515
|
+
log?.(`subagent-watcher: ${agentId} was in-flight at boot \u2014 promoting to live (last write ${Math.round(fileAgeMs / 1000)}s ago; user still awaiting handback)`);
|
|
49516
|
+
if (db2 != null) {
|
|
49517
|
+
try {
|
|
49518
|
+
backfillJsonlAgentId(db2, filePath, agentId, log);
|
|
49519
|
+
} catch (err) {
|
|
49520
|
+
log?.(`subagent-watcher: backfill error for ${agentId}: ${err.message}`);
|
|
49521
|
+
}
|
|
49522
|
+
}
|
|
49523
|
+
}
|
|
49524
|
+
}
|
|
49366
49525
|
if (isHistorical && entry.state === "done") {
|
|
49367
49526
|
entry.completionNotified = true;
|
|
49368
49527
|
scheduleTerminalCleanup(agentId);
|
|
@@ -49397,11 +49556,11 @@ function startSubagentWatcher(config) {
|
|
|
49397
49556
|
config.onFinish({
|
|
49398
49557
|
agentId,
|
|
49399
49558
|
state: entry.state,
|
|
49400
|
-
outcome: entry.historical ? "orphan" : "completed",
|
|
49559
|
+
outcome: entry.errored ? "failed" : entry.historical ? "orphan" : "completed",
|
|
49401
49560
|
toolCount: entry.toolCount,
|
|
49402
49561
|
durationMs: nowFn() - entry.dispatchedAt,
|
|
49403
49562
|
description: entry.description,
|
|
49404
|
-
resultText: entry.lastResultText
|
|
49563
|
+
resultText: entry.errored ? entry.lastResultText || entry.errorDetail || "" : entry.lastResultText
|
|
49405
49564
|
});
|
|
49406
49565
|
} catch (cbErr) {
|
|
49407
49566
|
log?.(`subagent-watcher: onFinish callback error ${agentId}: ${cbErr.message}`);
|
|
@@ -49518,7 +49677,7 @@ function startSubagentWatcher(config) {
|
|
|
49518
49677
|
recordSubagentEnd(db2, {
|
|
49519
49678
|
id: rowRef.id,
|
|
49520
49679
|
endedAt: n,
|
|
49521
|
-
status: "completed"
|
|
49680
|
+
status: entry.errored ? "failed" : "completed"
|
|
49522
49681
|
});
|
|
49523
49682
|
}
|
|
49524
49683
|
} catch (dbErr) {
|
|
@@ -51298,10 +51457,10 @@ function sweepStaleTurnActiveMarker(stateDir, opts) {
|
|
|
51298
51457
|
}
|
|
51299
51458
|
|
|
51300
51459
|
// ../src/build-info.ts
|
|
51301
|
-
var VERSION = "0.14.22";
|
|
51302
|
-
var COMMIT_SHA = "
|
|
51303
|
-
var COMMIT_DATE = "2026-05-
|
|
51304
|
-
var LATEST_PR =
|
|
51460
|
+
var VERSION = "0.14.24";
|
|
51461
|
+
var COMMIT_SHA = "2711d052";
|
|
51462
|
+
var COMMIT_DATE = "2026-05-31T22:59:44Z";
|
|
51463
|
+
var LATEST_PR = 2033;
|
|
51305
51464
|
var COMMITS_AHEAD_OF_TAG = 0;
|
|
51306
51465
|
|
|
51307
51466
|
// gateway/boot-version.ts
|
|
package/telegram-plugin/subagent-watcher.ts
CHANGED
|
@@ -40,7 +40,7 @@ import {
|
|
|
40
40
|
} from 'fs'
|
|
41
41
|
import { basename, join } from 'path'
|
|
42
42
|
import { homedir } from 'os'
|
|
43
|
-
import { projectSubagentLine, sanitizeCwdToProjectName } from './session-tail.js'
|
|
43
|
+
import { projectSubagentLine, sanitizeCwdToProjectName, detectErrorInTranscriptLine } from './session-tail.js'
|
|
44
44
|
import { sanitiseToolArg } from './fleet-state.js'
|
|
45
45
|
import { escapeHtml, truncate } from './card-format.js'
|
|
46
46
|
import { bumpSubagentActivity, recordSubagentStall, recordSubagentResume, recordSubagentEnd, reapStuckRunningRows } from './registry/subagents-schema.js'
|
|
@@ -142,6 +142,21 @@ export interface WorkerEntry {
|
|
|
142
142
|
* dead, the file is just left over from a prior session.
|
|
143
143
|
*/
|
|
144
144
|
historical: boolean
|
|
145
|
+
/**
|
|
146
|
+
* True once a TERMINAL error line — a model API failure / quota
|
|
147
|
+
* exhaustion / crash, NOT an in-flight retry or a routine tool-level
|
|
148
|
+
* `is_error` result — has been observed in this worker's own
|
|
149
|
+
* transcript. Drives the `failed` terminal outcome so the handback
|
|
150
|
+
* tells the user the delegated work did NOT complete, instead of
|
|
151
|
+
* dressing a dead worker up as `completed`. Classified by
|
|
152
|
+
* `detectErrorInTranscriptLine` (the same gate the operator-event
|
|
153
|
+
* path uses), so transient mid-retry errors are excluded.
|
|
154
|
+
*/
|
|
155
|
+
errored?: boolean
|
|
156
|
+
/** Human-readable detail from the terminal error line, surfaced in the
|
|
157
|
+
* failed handback's "what it reported before failing" slot when the
|
|
158
|
+
* worker left no narrative result of its own. */
|
|
159
|
+
errorDetail?: string
|
|
145
160
|
}
|
|
146
161
|
|
|
147
162
|
export interface SubagentWatcherConfig {
|
|
@@ -193,6 +208,23 @@ export interface SubagentWatcherConfig {
|
|
|
193
208
|
* synthesis; tests use a tiny value to exercise the path.
|
|
194
209
|
*/
|
|
195
210
|
silentStallTerminalMs?: number
|
|
211
|
+
/**
|
|
212
|
+
* Freshness window (ms) for promoting a running-at-boot worker file to
|
|
213
|
+
* live. A file whose last write (mtime) is older than this is treated as
|
|
214
|
+
* a dead prior-session worker and stays historical/suppressed, NOT
|
|
215
|
+
* promoted. Default 15 min (DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS); env
|
|
216
|
+
* override `SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS`. Guards the v0.14.23
|
|
217
|
+
* stale-handback replay regression.
|
|
218
|
+
*/
|
|
219
|
+
inflightPromoteMaxAgeMs?: number
|
|
220
|
+
/**
|
|
221
|
+
* Kill-switch for the boot-scan promotion path. When false, a
|
|
222
|
+
* running-at-boot worker is never promoted — the watcher reverts to the
|
|
223
|
+
* pre-v0.14.23 behaviour of leaving every boot-scan file historical
|
|
224
|
+
* (suppressed). Default true; env `SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0`
|
|
225
|
+
* disables it fleet-wide without a code change (emergency lever).
|
|
226
|
+
*/
|
|
227
|
+
bootPromoteEnabled?: boolean
|
|
196
228
|
/**
|
|
197
229
|
* Reaper TTL (ms): background rows in `status='running'` whose
|
|
198
230
|
* `last_activity_at` (or `started_at` if liveness never wrote) is older
|
|
@@ -367,6 +399,29 @@ const DEFAULT_SILENT_SYNTHESIS_STALL_THRESHOLD_MS = 300_000
|
|
|
367
399
|
*/
|
|
368
400
|
const DEFAULT_SILENT_STALL_TERMINAL_MS = 300_000
|
|
369
401
|
|
|
402
|
+
/**
|
|
403
|
+
* Freshness window for the boot-scan "in-flight at boot → promote to
|
|
404
|
+
* live" path. A worker file still in `running` state at boot is only
|
|
405
|
+
* promoted (un-suppressed) if its last write (file mtime) is within this
|
|
406
|
+
* window of now. The signal cleanly separates the two populations:
|
|
407
|
+
*
|
|
408
|
+
* - A worker genuinely in-flight across a restart / fleet rollout was
|
|
409
|
+
* writing right up until the container was recreated, so its mtime is
|
|
410
|
+
* seconds-to-minutes before the new gateway boots — well inside the
|
|
411
|
+
* window. The user is still awaiting it; promote it.
|
|
412
|
+
* - A worker that died in a PRIOR session without writing a terminal
|
|
413
|
+
* `turn_end` is also `running` in the file, but its mtime is hours-to-
|
|
414
|
+
* weeks old. These accumulate by the dozen-to-hundred in a long-lived
|
|
415
|
+
* agent's subagents dir. Promoting them replays stale handbacks
|
|
416
|
+
* (often `failed`, from old error lines) on every boot — the v0.14.23
|
|
417
|
+
* regression. Leave them historical/suppressed, exactly as before.
|
|
418
|
+
*
|
|
419
|
+
* 15 min is generous for any plausible restart gap (container recreate +
|
|
420
|
+
* image pull) yet far below the staleness of a dead prior-session file.
|
|
421
|
+
* Override with `SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS`.
|
|
422
|
+
*/
|
|
423
|
+
const DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS = 15 * 60_000
|
|
424
|
+
|
|
370
425
|
/**
|
|
371
426
|
* Cap on the result text retained per sub-agent (`entry.lastResultText`)
|
|
372
427
|
* and carried to the gateway via `onFinish`. The gateway feeds this into
|
|
@@ -611,6 +666,20 @@ export function readSubTail(
|
|
|
611
666
|
const startState = { hasEmittedStart: tail.hasEmittedStart }
|
|
612
667
|
for (const line of lines) {
|
|
613
668
|
if (!line) continue
|
|
669
|
+
// Gap 2 (failure honesty): a terminal error line in the worker's
|
|
670
|
+
// OWN transcript — a model API failure, quota exhaustion, or crash —
|
|
671
|
+
// means the worker FAILED, not finished. Reuse the operator-event
|
|
672
|
+
// classifier: `terminal:true` excludes in-flight retries (a 529 mid-
|
|
673
|
+
// backoff is `terminal:false`), and tool-level `is_error` results
|
|
674
|
+
// never reach here (they parse as `sub_agent_tool_result`, which is
|
|
675
|
+
// routine mid-run noise, not a worker death). The flag persists on
|
|
676
|
+
// the entry; the terminal transition (real turn_end OR stall
|
|
677
|
+
// synthesis) reads it to emit `failed` instead of `completed`.
|
|
678
|
+
const errInfo = detectErrorInTranscriptLine(line)
|
|
679
|
+
if (errInfo?.terminal) {
|
|
680
|
+
entry.errored = true
|
|
681
|
+
if (errInfo.detail) entry.errorDetail = errInfo.detail.slice(0, SUBAGENT_RESULT_TEXT_MAX)
|
|
682
|
+
}
|
|
614
683
|
const events = projectSubagentLine(line, entry.agentId, startState)
|
|
615
684
|
for (const ev of events) {
|
|
616
685
|
const idleSecBeforeBump = Math.round((now - entry.lastActivityAt) / 1000)
|
|
@@ -716,7 +785,10 @@ export function readSubTail(
|
|
|
716
785
|
recordSubagentEnd(db, {
|
|
717
786
|
id: rowRef.id,
|
|
718
787
|
endedAt: now,
|
|
719
|
-
status: 'completed',
|
|
788
|
+
// Gap 2: keep the audit row honest — a worker that hit a
|
|
789
|
+
// terminal transcript error is `failed`, matching the
|
|
790
|
+
// handback outcome computed in maybySendStateTransition.
|
|
791
|
+
status: entry.errored ? 'failed' : 'completed',
|
|
720
792
|
})
|
|
721
793
|
}
|
|
722
794
|
} catch (dbErr) {
|
|
@@ -778,6 +850,14 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
|
|
|
778
850
|
config.silentStallTerminalMs
|
|
779
851
|
?? parseEnvMs('SWITCHROOM_SUBAGENT_STALL_TERMINAL_MS')
|
|
780
852
|
?? DEFAULT_SILENT_STALL_TERMINAL_MS
|
|
853
|
+
const inflightPromoteMaxAgeMs =
|
|
854
|
+
config.inflightPromoteMaxAgeMs
|
|
855
|
+
?? parseEnvMs('SWITCHROOM_SUBAGENT_INFLIGHT_MAX_AGE_MS')
|
|
856
|
+
?? DEFAULT_INFLIGHT_PROMOTE_MAX_AGE_MS
|
|
857
|
+
// Kill-switch: not parseEnvMs (which rejects `0`) — an explicit `=0`
|
|
858
|
+
// here MUST disable promotion (revert to pre-v0.14.23 suppression).
|
|
859
|
+
const bootPromoteEnabled =
|
|
860
|
+
config.bootPromoteEnabled ?? (process.env.SWITCHROOM_SUBAGENT_BOOT_PROMOTE !== '0')
|
|
781
861
|
const reaperTtlMs = config.reaperTtlMs ?? DEFAULT_REAPER_TTL_MS
|
|
782
862
|
const reaperIntervalMs = config.reaperIntervalMs ?? DEFAULT_REAPER_INTERVAL_MS
|
|
783
863
|
const rescanMs = config.rescanMs ?? DEFAULT_RESCAN_MS
|
|
@@ -917,6 +997,56 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
|
|
|
917
997
|
log?.(`subagent-watcher: description updated for ${agentId}: ${desc}`)
|
|
918
998
|
}, fs, log, db, parentStateDir, config.onUnstall, undefined, config.onProgress)
|
|
919
999
|
|
|
1000
|
+
// Gap 1 (restart survival): a file still RUNNING at boot is a LIVE
|
|
1001
|
+
// worker that predates this watcher — typically one dispatched in a
|
|
1002
|
+
// prior gateway life and still in-flight across a restart / fleet
|
|
1003
|
+
// rollout, NOT a stale already-finished file. `historical` must
|
|
1004
|
+
// suppress replay only for done-at-boot files; an in-flight-at-boot
|
|
1005
|
+
// worker the user is still waiting on must get full live treatment:
|
|
1006
|
+
// progress nudges, the stall-synthesis safety net (checkStalls skips
|
|
1007
|
+
// historical entries), and a real `completed`/`failed` handback rather
|
|
1008
|
+
// than a dropped `orphan`. Promote it to a live entry here. (A file
|
|
1009
|
+
// already `done` at boot stays historical and is short-circuited just
|
|
1010
|
+
// below — it finished before this session.)
|
|
1011
|
+
if (isHistorical && entry.state === 'running') {
|
|
1012
|
+
// Freshness gate (v0.14.24): only promote a file whose LAST WRITE is
|
|
1013
|
+
// recent. A genuinely in-flight-across-a-restart worker was writing
|
|
1014
|
+
// until the container was recreated (mtime seconds-to-minutes old); a
|
|
1015
|
+
// dead prior-session worker that never wrote a terminal turn_end is
|
|
1016
|
+
// also `running` but hours-to-weeks stale. Promoting the latter
|
|
1017
|
+
// replayed stale `failed` handbacks on every boot (the v0.14.23
|
|
1018
|
+
// fleet-wide regression). Unreadable mtime → treat as stale (suppress
|
|
1019
|
+
// rather than risk re-spamming). The kill-switch reverts to pre-fix
|
|
1020
|
+
// suppression entirely.
|
|
1021
|
+
let fileAgeMs = Infinity
|
|
1022
|
+
try {
|
|
1023
|
+
const st = fs.statSync(filePath)
|
|
1024
|
+
if (typeof st.mtimeMs === 'number') fileAgeMs = n - st.mtimeMs
|
|
1025
|
+
} catch {
|
|
1026
|
+
/* unreadable → Infinity → treated as stale below */
|
|
1027
|
+
}
|
|
1028
|
+
if (!bootPromoteEnabled) {
|
|
1029
|
+
log?.(`subagent-watcher: ${agentId} running at boot but promotion disabled (SWITCHROOM_SUBAGENT_BOOT_PROMOTE=0) — leaving historical`)
|
|
1030
|
+
} else if (fileAgeMs > inflightPromoteMaxAgeMs) {
|
|
1031
|
+
log?.(`subagent-watcher: ${agentId} running at boot but stale (last write ${Math.round(fileAgeMs / 1000)}s ago > ${Math.round(inflightPromoteMaxAgeMs / 1000)}s) — leaving historical (dead prior-session worker, not in-flight)`)
|
|
1032
|
+
} else {
|
|
1033
|
+
entry.historical = false
|
|
1034
|
+
log?.(`subagent-watcher: ${agentId} was in-flight at boot — promoting to live (last write ${Math.round(fileAgeMs / 1000)}s ago; user still awaiting handback)`)
|
|
1035
|
+
// The prior gateway life's registration normally linked
|
|
1036
|
+
// jsonl_agent_id already, but re-run the backfill idempotently in
|
|
1037
|
+
// case that life crashed before the link persisted — the handback's
|
|
1038
|
+
// isBackground lookup is keyed on jsonl_agent_id, and an unlinked row
|
|
1039
|
+
// would mis-resolve the worker as foreground and drop the handback.
|
|
1040
|
+
if (db != null) {
|
|
1041
|
+
try {
|
|
1042
|
+
backfillJsonlAgentId(db, filePath, agentId, log)
|
|
1043
|
+
} catch (err) {
|
|
1044
|
+
log?.(`subagent-watcher: backfill error for ${agentId}: ${(err as Error).message}`)
|
|
1045
|
+
}
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
|
|
920
1050
|
// If the JSONL already contained a turn_end at registration time
|
|
921
1051
|
// (file written-then-watched), fire the state-transition + completion
|
|
922
1052
|
// notification now. Otherwise the FSWatcher callback handles it on
|
|
@@ -980,11 +1110,22 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
|
|
|
980
1110
|
config.onFinish({
|
|
981
1111
|
agentId,
|
|
982
1112
|
state: entry.state,
|
|
983
|
-
outcome: entry.historical ? 'orphan' : 'completed',
|
|
1113
|
+
// Gap 2: a terminal error observed in the transcript wins over
|
|
1114
|
+
// the completed/orphan classification — a worker that crashed
|
|
1115
|
+
// is `failed`, even if it later wrote a turn_end or aged into
|
|
1116
|
+
// stall synthesis. `orphan` remains for genuinely stale
|
|
1117
|
+
// done-at-boot rows (which never reach this path; see
|
|
1118
|
+
// registerAgent's short-circuit + Gap 1 promotion).
|
|
1119
|
+
outcome: entry.errored ? 'failed' : entry.historical ? 'orphan' : 'completed',
|
|
984
1120
|
toolCount: entry.toolCount,
|
|
985
1121
|
durationMs: nowFn() - entry.dispatchedAt,
|
|
986
1122
|
description: entry.description,
|
|
987
|
-
resultText: entry.lastResultText,
|
|
1123
|
+
// For a failure, fall back to the error detail when the worker
|
|
1124
|
+
// left no narrative of its own — so the handback's "what it
|
|
1125
|
+
// reported before failing" slot is never empty on a crash.
|
|
1126
|
+
resultText: entry.errored
|
|
1127
|
+
? entry.lastResultText || entry.errorDetail || ''
|
|
1128
|
+
: entry.lastResultText,
|
|
988
1129
|
})
|
|
989
1130
|
} catch (cbErr) {
|
|
990
1131
|
log?.(`subagent-watcher: onFinish callback error ${agentId}: ${(cbErr as Error).message}`)
|
|
@@ -1151,7 +1292,10 @@ export function startSubagentWatcher(config: SubagentWatcherConfig): SubagentWat
|
|
|
1151
1292
|
recordSubagentEnd(db, {
|
|
1152
1293
|
id: rowRef.id,
|
|
1153
1294
|
endedAt: n,
|
|
1154
|
-
status: 'completed',
|
|
1295
|
+
// Gap 2: a worker that hit a terminal transcript error before
|
|
1296
|
+
// going silent is `failed`, not `completed` — keep the audit
|
|
1297
|
+
// row consistent with the handback outcome.
|
|
1298
|
+
status: entry.errored ? 'failed' : 'completed',
|
|
1155
1299
|
})
|
|
1156
1300
|
}
|
|
1157
1301
|
} catch (dbErr) {
|
|
package/telegram-plugin/tests/subagent-registry-bugs.test.ts
CHANGED
|
@@ -624,13 +624,17 @@ describe('Bug 3 — stalled-row sweeper: watcher must call recordSubagentStall i
|
|
|
624
624
|
h.watcher.stop()
|
|
625
625
|
})
|
|
626
626
|
|
|
627
|
-
it('does not call stall for historical
|
|
627
|
+
it('does not call stall for historical (done-at-boot) entries', () => {
|
|
628
|
+
// A worker that already FINISHED before boot (turn_end present) stays
|
|
629
|
+
// historical and must not write stall rows. A still-RUNNING file at
|
|
630
|
+
// boot is a different case — Gap 1 promotes it to live so it DOES get
|
|
631
|
+
// the stall safety net (covered in subagent-watcher-handback-gaps).
|
|
628
632
|
const agentDir = '/home/user/.switchroom/agents/myagent'
|
|
629
633
|
const subagentsDir = `${agentDir}/.claude/projects/p1/session-abc/subagents`
|
|
630
634
|
const jsonlStem = 'hist-agent'
|
|
631
635
|
const toolUseId = 'toolu_hist001'
|
|
632
636
|
const jsonlPath = `${subagentsDir}/agent-${jsonlStem}.jsonl`
|
|
633
|
-
const content = buildJSONL(subAgentUserMsg('Old task'))
|
|
637
|
+
const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
|
|
634
638
|
|
|
635
639
|
const db = makeInMemoryDb({
|
|
636
640
|
[toolUseId]: { id: toolUseId, jsonl_agent_id: jsonlStem, status: 'running' },
|
|
@@ -648,7 +652,7 @@ describe('Bug 3 — stalled-row sweeper: watcher must call recordSubagentStall i
|
|
|
648
652
|
db,
|
|
649
653
|
})
|
|
650
654
|
|
|
651
|
-
//
|
|
655
|
+
// Done-at-boot → stays historical (not promoted); no stall write fires.
|
|
652
656
|
h.advance(65_000)
|
|
653
657
|
|
|
654
658
|
const stallDbCalls = db._calls.filter(
|
|
package/telegram-plugin/tests/subagent-watcher-handback-gaps.test.ts
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the two background-worker handback gaps closed in
|
|
3
|
+
* `fix/subagent-handback-restart-and-failure`:
|
|
4
|
+
*
|
|
5
|
+
* Gap 1 — restart survival. A background worker that is in-flight when
|
|
6
|
+
* the gateway restarts is discovered by the boot scan and tagged
|
|
7
|
+
* `historical`. That flag is meant to suppress replay for workers that
|
|
8
|
+
* ALREADY finished before boot — but it was also applied to workers
|
|
9
|
+
* still running, which then completed with outcome `orphan`, and the
|
|
10
|
+
* handback gate drops `orphan`. Net: dispatched worker + any gateway
|
|
11
|
+
* bounce (incl. a fleet rollout) + worker finishes = user never told.
|
|
12
|
+
* Fix: a file still `running` at boot is promoted to a LIVE entry, so
|
|
13
|
+
* it gets the stall-synthesis safety net and a real `completed`/`failed`
|
|
14
|
+
* handback. A file already `done` at boot stays suppressed.
|
|
15
|
+
*
|
|
16
|
+
* Gap 2 — failure honesty. The `failed` outcome was dead code (no caller
|
|
17
|
+
* set it), so every dead worker was reported `completed`. Fix: a
|
|
18
|
+
* TERMINAL error line in the worker's own transcript (model API failure
|
|
19
|
+
* / quota exhaustion / crash — not an in-flight retry, not a routine
|
|
20
|
+
* tool-level is_error) flips the terminal outcome to `failed` and
|
|
21
|
+
* carries the error detail into the handback result.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { describe, it, expect, vi } from 'vitest'
|
|
25
|
+
import { startSubagentWatcher } from '../subagent-watcher.js'
|
|
26
|
+
import * as fs from 'fs'
|
|
27
|
+
|
|
28
|
+
function buildJSONL(...lines: object[]): string {
|
|
29
|
+
return lines.map((l) => JSON.stringify(l)).join('\n') + '\n'
|
|
30
|
+
}
|
|
31
|
+
function subAgentUserMsg(promptText: string) {
|
|
32
|
+
return { type: 'user', message: { content: [{ type: 'text', text: promptText }] } }
|
|
33
|
+
}
|
|
34
|
+
function subAgentText(text: string) {
|
|
35
|
+
return { type: 'assistant', message: { content: [{ type: 'text', text }] } }
|
|
36
|
+
}
|
|
37
|
+
function subAgentTurnEnd() {
|
|
38
|
+
return { type: 'system', subtype: 'turn_duration', duration_ms: 1234 }
|
|
39
|
+
}
|
|
40
|
+
// A terminal error line in the worker's OWN transcript — the model call
|
|
41
|
+
// itself failed (here an invalid_request_error). `detectErrorInTranscriptLine`
|
|
42
|
+
// classifies an explicit `type:"error"` line with a non-rate-limit kind as
|
|
43
|
+
// terminal:true.
|
|
44
|
+
function subAgentTerminalError(message: string) {
|
|
45
|
+
return { type: 'error', error: { type: 'invalid_request_error', message } }
|
|
46
|
+
}
|
|
47
|
+
// A routine mid-run tool failure (e.g. a grep that found nothing). This is a
|
|
48
|
+
// `sub_agent_tool_result` with is_error — NOT a worker death. Must NOT trip
|
|
49
|
+
// the failed classification.
|
|
50
|
+
function subAgentToolResultError() {
|
|
51
|
+
return {
|
|
52
|
+
type: 'user',
|
|
53
|
+
message: {
|
|
54
|
+
content: [{ type: 'tool_result', tool_use_id: 'toolu_x', is_error: true, content: 'no matches found' }],
|
|
55
|
+
},
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
interface FinishCall {
|
|
60
|
+
agentId: string
|
|
61
|
+
outcome: string
|
|
62
|
+
resultText: string
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
interface Harness {
|
|
66
|
+
stallTerminalCalls: Array<{ agentId: string }>
|
|
67
|
+
finishCalls: FinishCall[]
|
|
68
|
+
logs: string[]
|
|
69
|
+
advance: (ms: number) => void
|
|
70
|
+
watcher: ReturnType<typeof startSubagentWatcher>
|
|
71
|
+
fileContents: Map<string, Buffer>
|
|
72
|
+
jsonlPath: string
|
|
73
|
+
append: (...lines: object[]) => void
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
function makeHarness(opts: {
|
|
77
|
+
agentId?: string
|
|
78
|
+
/** Lines present in the JSONL at boot (before the watcher starts). */
|
|
79
|
+
bootLines: object[]
|
|
80
|
+
stallThresholdMs?: number
|
|
81
|
+
silentStallTerminalMs?: number
|
|
82
|
+
rescanMs?: number
|
|
83
|
+
/** How long ago (ms) the boot file was last written, i.e. its mtime is
|
|
84
|
+
* `currentTime - bootFileAgeMs` at registration. Default 0 (fresh, so the
|
|
85
|
+
* freshness gate promotes it). Set large to simulate a dead prior-session
|
|
86
|
+
* worker that must NOT be promoted. */
|
|
87
|
+
bootFileAgeMs?: number
|
|
88
|
+
/** Kill-switch passthrough; default true (promotion enabled). */
|
|
89
|
+
bootPromoteEnabled?: boolean
|
|
90
|
+
inflightPromoteMaxAgeMs?: number
|
|
91
|
+
}): Harness {
|
|
92
|
+
const {
|
|
93
|
+
agentId = 'gap-agent',
|
|
94
|
+
bootLines,
|
|
95
|
+
stallThresholdMs = 60_000,
|
|
96
|
+
silentStallTerminalMs = 300_000,
|
|
97
|
+
rescanMs = 500,
|
|
98
|
+
bootFileAgeMs = 0,
|
|
99
|
+
bootPromoteEnabled = true,
|
|
100
|
+
inflightPromoteMaxAgeMs,
|
|
101
|
+
} = opts
|
|
102
|
+
|
|
103
|
+
let currentTime = 1000
|
|
104
|
+
const stallTerminalCalls: Array<{ agentId: string }> = []
|
|
105
|
+
const finishCalls: FinishCall[] = []
|
|
106
|
+
const logs: string[] = []
|
|
107
|
+
|
|
108
|
+
const agentDir = '/home/user/.switchroom/agents/myagent'
|
|
109
|
+
const sessionId = 'mock-session'
|
|
110
|
+
const projectsRoot = `${agentDir}/.claude/projects`
|
|
111
|
+
const projectDir = `${projectsRoot}/mock-cwd`
|
|
112
|
+
const sessionDir = `${projectDir}/${sessionId}`
|
|
113
|
+
const subagentsDir = `${sessionDir}/subagents`
|
|
114
|
+
const jsonlPath = `${subagentsDir}/agent-${agentId}.jsonl`
|
|
115
|
+
|
|
116
|
+
const fileContents = new Map<string, Buffer>()
|
|
117
|
+
fileContents.set(jsonlPath, Buffer.from(buildJSONL(...bootLines), 'utf-8'))
|
|
118
|
+
// Per-file mtime (ms). The boot file's last write is `bootFileAgeMs` in the
|
|
119
|
+
// past; appends bump it to currentTime. The freshness gate reads this.
|
|
120
|
+
const fileMtimes = new Map<string, number>()
|
|
121
|
+
fileMtimes.set(jsonlPath, 1000 - bootFileAgeMs)
|
|
122
|
+
|
|
123
|
+
let lastOpenedPath: string | null = null
|
|
124
|
+
const mockFs = {
|
|
125
|
+
existsSync: ((p: fs.PathLike) => {
|
|
126
|
+
const ps = String(p)
|
|
127
|
+
if (ps === projectsRoot || ps === projectDir || ps === sessionDir || ps === subagentsDir) return true
|
|
128
|
+
if (fileContents.has(ps)) return true
|
|
129
|
+
return false
|
|
130
|
+
}) as typeof fs.existsSync,
|
|
131
|
+
readdirSync: ((p: fs.PathLike) => {
|
|
132
|
+
const ps = String(p)
|
|
133
|
+
if (ps === projectsRoot) return ['mock-cwd']
|
|
134
|
+
if (ps === projectDir) return [sessionId]
|
|
135
|
+
if (ps === sessionDir) return ['subagents']
|
|
136
|
+
if (ps === subagentsDir) return [`agent-${agentId}.jsonl`]
|
|
137
|
+
return []
|
|
138
|
+
}) as unknown as typeof fs.readdirSync,
|
|
139
|
+
statSync: ((p: fs.PathLike) => ({ size: fileContents.get(String(p))?.length ?? 0, mtimeMs: fileMtimes.get(String(p)) ?? currentTime }) as fs.Stats) as typeof fs.statSync,
|
|
140
|
+
openSync: ((p: fs.PathLike) => {
|
|
141
|
+
lastOpenedPath = String(p)
|
|
142
|
+
return 42
|
|
143
|
+
}) as unknown as typeof fs.openSync,
|
|
144
|
+
closeSync: (() => { lastOpenedPath = null }) as typeof fs.closeSync,
|
|
145
|
+
readSync: ((
|
|
146
|
+
_fd: number,
|
|
147
|
+
buf: NodeJS.ArrayBufferView,
|
|
148
|
+
offset: number,
|
|
149
|
+
length: number,
|
|
150
|
+
position: number | null,
|
|
151
|
+
): number => {
|
|
152
|
+
const content = lastOpenedPath != null ? fileContents.get(lastOpenedPath) : undefined
|
|
153
|
+
if (!content) return 0
|
|
154
|
+
const pos = position ?? 0
|
|
155
|
+
const src = content.slice(pos, pos + length)
|
|
156
|
+
;(src as Buffer).copy(buf as Buffer, offset)
|
|
157
|
+
return src.length
|
|
158
|
+
}) as unknown as typeof fs.readSync,
|
|
159
|
+
watch: (() => ({ close: vi.fn() }) as unknown as fs.FSWatcher) as unknown as typeof fs.watch,
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
const intervals: Array<{ fn: () => void; ms: number; ref: number; fireAt: number }> = []
|
|
163
|
+
let nextRef = 1
|
|
164
|
+
|
|
165
|
+
const watcher = startSubagentWatcher({
|
|
166
|
+
agentDir,
|
|
167
|
+
stallThresholdMs,
|
|
168
|
+
silentSynthesisStallThresholdMs: stallThresholdMs,
|
|
169
|
+
silentStallTerminalMs,
|
|
170
|
+
rescanMs,
|
|
171
|
+
bootPromoteEnabled,
|
|
172
|
+
...(inflightPromoteMaxAgeMs != null ? { inflightPromoteMaxAgeMs } : {}),
|
|
173
|
+
onStallTerminal: (id) => stallTerminalCalls.push({ agentId: id }),
|
|
174
|
+
onFinish: ({ agentId: id, outcome, resultText }) =>
|
|
175
|
+
finishCalls.push({ agentId: id, outcome, resultText }),
|
|
176
|
+
now: () => currentTime,
|
|
177
|
+
setInterval: (fn, ms) => {
|
|
178
|
+
const ref = nextRef++
|
|
179
|
+
intervals.push({ fn, ms, ref, fireAt: currentTime + ms })
|
|
180
|
+
return { ref }
|
|
181
|
+
},
|
|
182
|
+
clearInterval: (handle) => {
|
|
183
|
+
const { ref } = handle as { ref: number }
|
|
184
|
+
const idx = intervals.findIndex((i) => i.ref === ref)
|
|
185
|
+
if (idx !== -1) intervals.splice(idx, 1)
|
|
186
|
+
},
|
|
187
|
+
fs: mockFs,
|
|
188
|
+
log: (msg) => logs.push(msg),
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
const advance = (ms: number): void => {
|
|
192
|
+
currentTime += ms
|
|
193
|
+
for (;;) {
|
|
194
|
+
intervals.sort((a, b) => a.fireAt - b.fireAt)
|
|
195
|
+
const next = intervals[0]
|
|
196
|
+
if (!next || next.fireAt > currentTime) break
|
|
197
|
+
next.fireAt += next.ms
|
|
198
|
+
next.fn()
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
const append = (...lines: object[]): void => {
|
|
203
|
+
const cur = fileContents.get(jsonlPath) ?? Buffer.alloc(0)
|
|
204
|
+
const more = buildJSONL(...lines)
|
|
205
|
+
fileContents.set(jsonlPath, Buffer.concat([cur, Buffer.from(more, 'utf-8')]))
|
|
206
|
+
fileMtimes.set(jsonlPath, currentTime)
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
return { stallTerminalCalls, finishCalls, logs, advance, watcher, fileContents, jsonlPath, append }
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
describe('Gap 1 — background worker in-flight across a gateway restart', () => {
|
|
213
|
+
it('an in-flight-at-boot worker that completes hands back as completed (not orphan)', () => {
|
|
214
|
+
// Boot scan finds a running worker (prompt, no turn_end yet) → pre-fix, tagged
|
|
215
|
+
// historical. The fix promotes it to live. When it finishes under our
|
|
216
|
+
// watch, the outcome must be `completed` so the handback delivers.
|
|
217
|
+
const h = makeHarness({ agentId: 'gap1-complete', bootLines: [subAgentUserMsg('bg task')] })
|
|
218
|
+
|
|
219
|
+
// The worker finishes after the restart.
|
|
220
|
+
h.append(subAgentText('Found the root cause in auth.ts'), subAgentTurnEnd())
|
|
221
|
+
h.advance(600) // one poll reads the new bytes
|
|
222
|
+
|
|
223
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
224
|
+
expect(h.finishCalls[0].agentId).toBe('gap1-complete')
|
|
225
|
+
expect(h.finishCalls[0].outcome).toBe('completed') // pre-fix: 'orphan' → dropped
|
|
226
|
+
expect(h.finishCalls[0].resultText).toContain('root cause')
|
|
227
|
+
// The promotion is logged so the path is observable in prod.
|
|
228
|
+
expect(h.logs.some((l) => l.includes('in-flight at boot — promoting to live'))).toBe(true)
|
|
229
|
+
})
|
|
230
|
+
|
|
231
|
+
it('an in-flight-at-boot worker that dies silently is rescued by stall synthesis', () => {
|
|
232
|
+
// Pre-fix, historical entries were skipped by stall detection, so a
|
|
233
|
+
// worker that crossed a restart and then went silent sat running
|
|
234
|
+
// forever — no handback ever. After promotion it gets the safety net.
|
|
235
|
+
const h = makeHarness({
|
|
236
|
+
agentId: 'gap1-silent',
|
|
237
|
+
bootLines: [subAgentUserMsg('bg task')],
|
|
238
|
+
stallThresholdMs: 60_000,
|
|
239
|
+
silentStallTerminalMs: 120_000,
|
|
240
|
+
})
|
|
241
|
+
|
|
242
|
+
h.advance(62_000) // stall threshold crossed
|
|
243
|
+
expect(h.stallTerminalCalls).toHaveLength(0)
|
|
244
|
+
h.advance(121_000) // silent-stall terminal window elapses → synthesis
|
|
245
|
+
expect(h.stallTerminalCalls).toHaveLength(1)
|
|
246
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
247
|
+
expect(h.finishCalls[0].outcome).toBe('completed')
|
|
248
|
+
})
|
|
249
|
+
|
|
250
|
+
it('a worker already DONE at boot stays suppressed (no spurious replay)', () => {
|
|
251
|
+
// The legitimate use of `historical`: a worker that finished in a prior
|
|
252
|
+
// session must NOT re-fire a handback on every restart. This is the
|
|
253
|
+
// regression guard for the fix.
|
|
254
|
+
const h = makeHarness({
|
|
255
|
+
agentId: 'gap1-stale',
|
|
256
|
+
bootLines: [subAgentUserMsg('bg task'), subAgentText('done long ago'), subAgentTurnEnd()],
|
|
257
|
+
})
|
|
258
|
+
|
|
259
|
+
h.advance(600)
|
|
260
|
+
h.advance(600_000) // well past any stall window
|
|
261
|
+
expect(h.finishCalls).toHaveLength(0)
|
|
262
|
+
expect(h.stallTerminalCalls).toHaveLength(0)
|
|
263
|
+
})
|
|
264
|
+
})
|
|
265
|
+
|
|
266
|
+
describe('Gap 1 freshness gate — v0.14.24 stale-replay regression', () => {
|
|
267
|
+
// The v0.14.23 regression (gated in v0.14.24): promoting EVERY running-at-boot file replayed
|
|
268
|
+
// weeks-old dead prior-session workers as handbacks (often `failed`, from
|
|
269
|
+
// old error lines) on every boot, spamming the whole fleet. The gate
|
|
270
|
+
// promotes only files whose last write is recent.
|
|
271
|
+
|
|
272
|
+
it('a STALE running-at-boot worker (weeks-old mtime) is NOT promoted — no handback, no stall', () => {
|
|
273
|
+
const h = makeHarness({
|
|
274
|
+
agentId: 'gap1-stale-running',
|
|
275
|
+
bootLines: [subAgentUserMsg('bg task from weeks ago')], // running: no turn_end
|
|
276
|
+
bootFileAgeMs: 21 * 24 * 60 * 60_000, // 21 days old — clearly dead
|
|
277
|
+
silentStallTerminalMs: 120_000,
|
|
278
|
+
})
|
|
279
|
+
|
|
280
|
+
h.advance(600)
|
|
281
|
+
h.advance(600_000) // far past every stall/synthesis window
|
|
282
|
+
expect(h.finishCalls).toHaveLength(0) // pre-fix: a spurious (often failed) handback
|
|
283
|
+
expect(h.stallTerminalCalls).toHaveLength(0)
|
|
284
|
+
expect(h.logs.some((l) => l.includes('stale') && l.includes('leaving historical'))).toBe(true)
|
|
285
|
+
})
|
|
286
|
+
|
|
287
|
+
it('a FRESH running-at-boot worker (recent mtime) IS still promoted and hands back', () => {
|
|
288
|
+
// Preserve the genuine Gap 1 fix: a worker in-flight across a restart
|
|
289
|
+
// (wrote moments before the bounce) must still get promoted + handed back.
|
|
290
|
+
const h = makeHarness({
|
|
291
|
+
agentId: 'gap1-fresh-running',
|
|
292
|
+
bootLines: [subAgentUserMsg('bg task')],
|
|
293
|
+
bootFileAgeMs: 30_000, // 30s old — in-flight across a quick restart
|
|
294
|
+
})
|
|
295
|
+
|
|
296
|
+
h.append(subAgentText('Finished the migration'), subAgentTurnEnd())
|
|
297
|
+
h.advance(600)
|
|
298
|
+
|
|
299
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
300
|
+
expect(h.finishCalls[0].outcome).toBe('completed')
|
|
301
|
+
expect(h.logs.some((l) => l.includes('promoting to live'))).toBe(true)
|
|
302
|
+
})
|
|
303
|
+
|
|
304
|
+
it('kill-switch (bootPromoteEnabled=false) suppresses even a fresh running-at-boot worker', () => {
|
|
305
|
+
const h = makeHarness({
|
|
306
|
+
agentId: 'gap1-killswitch',
|
|
307
|
+
bootLines: [subAgentUserMsg('bg task')],
|
|
308
|
+
bootFileAgeMs: 5_000, // fresh — would normally promote
|
|
309
|
+
bootPromoteEnabled: false,
|
|
310
|
+
silentStallTerminalMs: 120_000,
|
|
311
|
+
})
|
|
312
|
+
|
|
313
|
+
h.advance(600)
|
|
314
|
+
h.advance(600_000)
|
|
315
|
+
expect(h.finishCalls).toHaveLength(0)
|
|
316
|
+
expect(h.logs.some((l) => l.includes('promotion disabled'))).toBe(true)
|
|
317
|
+
})
|
|
318
|
+
|
|
319
|
+
it('a worker just past the freshness window is NOT promoted (boundary)', () => {
|
|
320
|
+
const h = makeHarness({
|
|
321
|
+
agentId: 'gap1-boundary',
|
|
322
|
+
bootLines: [subAgentUserMsg('bg task')],
|
|
323
|
+
inflightPromoteMaxAgeMs: 60_000, // 60s window
|
|
324
|
+
bootFileAgeMs: 90_000, // 90s old → just stale
|
|
325
|
+
silentStallTerminalMs: 120_000,
|
|
326
|
+
})
|
|
327
|
+
|
|
328
|
+
h.advance(600)
|
|
329
|
+
h.advance(600_000)
|
|
330
|
+
expect(h.finishCalls).toHaveLength(0)
|
|
331
|
+
expect(h.logs.some((l) => l.includes('stale'))).toBe(true)
|
|
332
|
+
})
|
|
333
|
+
})
|
|
334
|
+
|
|
335
|
+
describe('Gap 2 — failure honesty', () => {
|
|
336
|
+
it('a terminal error line flips the outcome to failed and carries the detail', () => {
|
|
337
|
+
const h = makeHarness({ agentId: 'gap2-failed', bootLines: [subAgentUserMsg('bg task')] })
|
|
338
|
+
|
|
339
|
+
// The worker's model call errors out, then the transcript ends.
|
|
340
|
+
h.append(subAgentTerminalError('tool input rejected by the API'), subAgentTurnEnd())
|
|
341
|
+
h.advance(600)
|
|
342
|
+
|
|
343
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
344
|
+
expect(h.finishCalls[0].outcome).toBe('failed')
|
|
345
|
+
// No narrative was emitted, so the detail backfills the result slot.
|
|
346
|
+
expect(h.finishCalls[0].resultText).toContain('tool input rejected')
|
|
347
|
+
})
|
|
348
|
+
|
|
349
|
+
it('a failed worker that went silent still synthesises terminal as failed', () => {
|
|
350
|
+
const h = makeHarness({
|
|
351
|
+
agentId: 'gap2-failed-silent',
|
|
352
|
+
bootLines: [subAgentUserMsg('bg task')],
|
|
353
|
+
stallThresholdMs: 60_000,
|
|
354
|
+
silentStallTerminalMs: 120_000,
|
|
355
|
+
})
|
|
356
|
+
|
|
357
|
+
// Error line, then the worker goes silent (no turn_end).
|
|
358
|
+
h.append(subAgentTerminalError('worker process crashed'))
|
|
359
|
+
h.advance(600) // read the error line
|
|
360
|
+
h.advance(62_000) // stall
|
|
361
|
+
h.advance(121_000) // synthesis
|
|
362
|
+
expect(h.stallTerminalCalls).toHaveLength(1)
|
|
363
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
364
|
+
expect(h.finishCalls[0].outcome).toBe('failed')
|
|
365
|
+
expect(h.finishCalls[0].resultText).toContain('crashed')
|
|
366
|
+
})
|
|
367
|
+
|
|
368
|
+
it('a routine mid-run tool error does NOT cause a false failure', () => {
|
|
369
|
+
const h = makeHarness({ agentId: 'gap2-toolerr', bootLines: [subAgentUserMsg('bg task')] })
|
|
370
|
+
|
|
371
|
+
// A tool_result with is_error (e.g. grep found nothing) mid-run, then
|
|
372
|
+
// the worker recovers and completes normally.
|
|
373
|
+
h.append(subAgentToolResultError(), subAgentText('Completed after a retry'), subAgentTurnEnd())
|
|
374
|
+
h.advance(600)
|
|
375
|
+
|
|
376
|
+
expect(h.finishCalls).toHaveLength(1)
|
|
377
|
+
expect(h.finishCalls[0].outcome).toBe('completed') // NOT failed
|
|
378
|
+
expect(h.finishCalls[0].resultText).toContain('Completed after a retry')
|
|
379
|
+
})
|
|
380
|
+
})
|
|
@@ -693,18 +693,21 @@ describe('startSubagentWatcher', () => {
|
|
|
693
693
|
h.watcher.stop()
|
|
694
694
|
})
|
|
695
695
|
|
|
696
|
-
it('suppresses stall notifications for historical entries', () => {
|
|
697
|
-
//
|
|
698
|
-
//
|
|
699
|
-
//
|
|
700
|
-
//
|
|
696
|
+
it('suppresses stall notifications for historical (done-at-boot) entries', () => {
|
|
697
|
+
// A worker that already FINISHED before the watcher booted (turn_end
|
|
698
|
+
// present in the file) stays historical and must NOT fire stall
|
|
699
|
+
// notifications. With months of finished session history present at
|
|
700
|
+
// restart, firing stalls for each would flood the chat. NOTE: a worker
|
|
701
|
+
// still RUNNING at boot is a different case — Gap 1 promotes it to live
|
|
702
|
+
// so it DOES get the stall safety net (it's an in-flight worker the
|
|
703
|
+
// user is still awaiting), covered in subagent-watcher-handback-gaps.
|
|
701
704
|
const agentDir = '/home/user/.switchroom/agents/myagent'
|
|
702
705
|
const projectsRoot = `${agentDir}/.claude/projects`
|
|
703
706
|
const projectDir = `${projectsRoot}/myproject`
|
|
704
707
|
const sessionDir = `${projectDir}/session-abc123`
|
|
705
708
|
const subagentsDir = `${sessionDir}/subagents`
|
|
706
709
|
const jsonlPath = `${subagentsDir}/agent-deadbeef.jsonl`
|
|
707
|
-
const content = buildJSONL(subAgentUserMsg('Old task'))
|
|
710
|
+
const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
|
|
708
711
|
|
|
709
712
|
const h = makeHarness({
|
|
710
713
|
agentDir,
|
|
@@ -809,12 +812,15 @@ describe('startSubagentWatcher', () => {
|
|
|
809
812
|
|
|
810
813
|
describe('historical-vs-active filter', () => {
|
|
811
814
|
/**
|
|
812
|
-
* Pre-existing
|
|
813
|
-
* Stalls and completion notifications are gated on
|
|
814
|
-
* restart with months of session history doesn't
|
|
815
|
+
* Pre-existing FINISHED (done-at-boot) JSONL files are tagged
|
|
816
|
+
* historical=true. Stalls and completion notifications are gated on
|
|
817
|
+
* !historical so a restart with months of session history doesn't
|
|
818
|
+
* flood the chat. (A still-RUNNING file at boot is promoted to live by
|
|
819
|
+
* Gap 1 — see subagent-watcher-handback-gaps — so it must carry a
|
|
820
|
+
* turn_end here to stay historical.)
|
|
815
821
|
*/
|
|
816
822
|
|
|
817
|
-
it('pre-existing JSONL files
|
|
823
|
+
it('pre-existing done-at-boot JSONL files are tagged historical', () => {
|
|
818
824
|
const agentDir = '/home/user/.switchroom/agents/myagent'
|
|
819
825
|
const projectsRoot = `${agentDir}/.claude/projects`
|
|
820
826
|
const projectDir = `${projectsRoot}/myproject`
|
|
@@ -823,7 +829,7 @@ describe('startSubagentWatcher', () => {
|
|
|
823
829
|
const jsonlA = `${subagentsDir}/agent-hist-aaaa.jsonl`
|
|
824
830
|
const jsonlB = `${subagentsDir}/agent-hist-bbbb.jsonl`
|
|
825
831
|
|
|
826
|
-
const content = buildJSONL(subAgentUserMsg('Old task'))
|
|
832
|
+
const content = buildJSONL(subAgentUserMsg('Old task'), subAgentTurnDuration())
|
|
827
833
|
|
|
828
834
|
const h = makeHarness({
|
|
829
835
|
agentDir,
|
|
@@ -895,10 +901,12 @@ describe('startSubagentWatcher', () => {
|
|
|
895
901
|
})
|
|
896
902
|
|
|
897
903
|
it('pre-existing in-flight agent that finishes after restart fires completion', () => {
|
|
898
|
-
//
|
|
899
|
-
//
|
|
900
|
-
//
|
|
901
|
-
//
|
|
904
|
+
// Running at boot → Gap 1 promotes it to live (historical=false),
|
|
905
|
+
// because it's an in-flight worker the user is still awaiting across
|
|
906
|
+
// the restart. When it then writes turn_end, the completion
|
|
907
|
+
// notification fires for the state transition. (The deeper handback
|
|
908
|
+
// outcome — completed, not the dropped `orphan` — is covered in
|
|
909
|
+
// subagent-watcher-handback-gaps.)
|
|
902
910
|
const agentDir = '/home/user/.switchroom/agents/myagent'
|
|
903
911
|
const projectsRoot = `${agentDir}/.claude/projects`
|
|
904
912
|
const projectDir = `${projectsRoot}/myproject`
|