switchroom 0.15.41 → 0.15.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +2 -1
- package/dist/auth-broker/index.js +2 -1
- package/dist/cli/notion-write-pretool.mjs +2 -1
- package/dist/cli/switchroom.js +157 -13
- package/dist/cli/ui/index.html +31 -0
- package/dist/host-control/main.js +2 -1
- package/dist/vault/approvals/kernel-server.js +2 -1
- package/dist/vault/broker/server.js +2 -1
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +397 -226
- package/telegram-plugin/gateway/context-occupancy.ts +91 -0
- package/telegram-plugin/gateway/gateway.ts +204 -63
- package/telegram-plugin/gateway/hostd-dispatch.ts +1 -1
- package/telegram-plugin/gateway/idle-clear.ts +72 -0
- package/telegram-plugin/gateway/poll-health.ts +9 -4
- package/telegram-plugin/gateway/poll-stall-recovery.ts +59 -0
- package/telegram-plugin/tests/context-occupancy.test.ts +55 -0
- package/telegram-plugin/tests/idle-clear.test.ts +62 -0
- package/telegram-plugin/tests/poll-stall-recovery.test.ts +32 -0
- package/telegram-plugin/tests/welcome-text.test.ts +10 -11
- package/telegram-plugin/welcome-text.ts +11 -12
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context-headroom snapshot (RFC reference/rfcs/context-headroom-surface.md).
|
|
3
|
+
*
|
|
4
|
+
* The gateway already computes working-context occupancy at the turn-end idle
|
|
5
|
+
* gate for proactive-compaction (gateway.ts ~3000). This module turns that
|
|
6
|
+
* value into a small on-disk snapshot the host surfaces (`switchroom status` /
|
|
7
|
+
* `doctor` / web) read, so the operator can SEE each agent's headroom-to-
|
|
8
|
+
* compaction — the predictability won by ENABLE_TOOL_SEARCH=true made visible.
|
|
9
|
+
*
|
|
10
|
+
* Pure + side-effect-light so it's unit-testable; the write is best-effort and
|
|
11
|
+
* never throws (a missing snapshot reads as `unknown`, never an error).
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { mkdirSync, writeFileSync } from "node:fs";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
|
|
17
|
+
/** Filename written under the agent's state dir. Shared with the host reader. */
export const CONTEXT_OCCUPANCY_FILENAME = "context-occupancy.json";

/** Occupancy ≥ this fraction of the cap → "tight" (compaction imminent). */
export const TIGHT_FRACTION = 0.8;

/** Coarse headroom classification stored in the snapshot. */
export type ContextState = "ok" | "tight" | "unknown";

/** Shape of the on-disk context-occupancy snapshot. */
export interface ContextOccupancy {
  /** Live working-context tokens (latest turn input + cache_read + cache_creation). */
  occupancy: number;
  /** session.max_context_tokens, or null when unset (native compaction only). */
  cap: number | null;
  /** cap - occupancy, or null when no cap. */
  headroom: number | null;
  /** occupancy / cap (0..1+), or null when no cap. */
  pct: number | null;
  /** ok / tight / unknown. `unknown` only when occupancy is unmeasurable. */
  state: ContextState;
  /** epoch ms (host/container clock) when computed. */
  computedAt: number;
}

/**
 * Build the snapshot from a measured occupancy + the resolved cap. Pure.
 *
 * The cap is normalised once, up front: only a finite, positive number counts
 * as a ceiling; `null`, `undefined`, `0`, negatives, `NaN` and `Infinity` all
 * become `null`. Every branch reports that same normalised cap.
 *
 *  - occupancy negative / NaN / infinite → state "unknown", occupancy reported
 *    as 0, no ratio.
 *  - no usable cap → no ratio; state "ok" (occupancy known, just no ceiling set).
 *  - otherwise → `pct = occupancy / cap`, `headroom = cap - occupancy`
 *    (negative when over the cap), state "tight" once `pct >= TIGHT_FRACTION`.
 *
 * @param occupancy Measured working-context tokens.
 * @param cap Configured ceiling, if any.
 * @param now Timestamp stored verbatim as `computedAt`.
 * @returns A freshly built snapshot object.
 */
export function buildContextOccupancy(
  occupancy: number,
  cap: number | null | undefined,
  now: number,
): ContextOccupancy {
  // An infinite cap would produce `headroom: Infinity`, which JSON.stringify
  // writes as `null` — so treat it as "no cap" rather than a real ceiling.
  const c = cap != null && Number.isFinite(cap) && cap > 0 ? cap : null;

  if (!Number.isFinite(occupancy) || occupancy < 0) {
    return { occupancy: 0, cap: c, headroom: null, pct: null, state: "unknown", computedAt: now };
  }
  if (c === null) {
    return { occupancy, cap: null, headroom: null, pct: null, state: "ok", computedAt: now };
  }

  const pct = occupancy / c;
  return {
    occupancy,
    cap: c,
    headroom: c - occupancy,
    pct,
    state: pct >= TIGHT_FRACTION ? "tight" : "ok",
    computedAt: now,
  };
}
|
|
67
|
+
|
|
68
|
+
/**
 * Persist `snapshot` as pretty-printed JSON (2-space indent, trailing newline)
 * at `<stateDir>/context-occupancy.json`, creating `stateDir` recursively
 * first.
 *
 * Best-effort: any error raised while building the path, creating the
 * directory, serialising or writing is swallowed here, so a failed write
 * never disrupts the turn-end gate (callers may still wrap the call in their
 * own try/catch).
 *
 * @param stateDir Directory the snapshot file lives in.
 * @param snapshot Snapshot to serialise.
 * @param deps Optional replacements for the directory-create and file-write
 *   steps; when a hook is omitted the synchronous `node:fs` call is used.
 */
export function writeContextOccupancySnapshot(
  stateDir: string,
  snapshot: ContextOccupancy,
  deps?: {
    mkdir?: (p: string, o: { recursive: true }) => void;
    writeFile?: (p: string, d: string) => void;
  },
): void {
  try {
    const target = join(stateDir, CONTEXT_OCCUPANCY_FILENAME);

    const ensureDir =
      deps?.mkdir ??
      ((dir: string, options: { recursive: true }) => mkdirSync(dir, options));
    ensureDir(stateDir, { recursive: true });

    const persist =
      deps?.writeFile ?? ((file: string, data: string) => writeFileSync(file, data));
    persist(target, JSON.stringify(snapshot, null, 2) + "\n");
  } catch {
    /* best-effort — never break the turn-end gate */
  }
}
|
|
@@ -208,7 +208,6 @@ import {
|
|
|
208
208
|
switchroomHelpText as buildSwitchroomHelpText,
|
|
209
209
|
restartAckText as buildRestartAckText,
|
|
210
210
|
newSessionAckText as buildNewSessionAckText,
|
|
211
|
-
resetSessionAckText as buildResetSessionAckText,
|
|
212
211
|
TELEGRAM_BASE_COMMANDS,
|
|
213
212
|
TELEGRAM_SWITCHROOM_COMMANDS,
|
|
214
213
|
type AgentMetadata, type AuthSummary, type StatusProbeRow,
|
|
@@ -266,7 +265,7 @@ import {
|
|
|
266
265
|
import { DEFAULT_SLOT } from '../../src/auth/accounts.js'
|
|
267
266
|
import { currentActiveSlot, type AuthCodeOutcome } from '../../src/auth/manager.js'
|
|
268
267
|
import { injectSlashCommand as injectSlashCommandImpl } from '../../src/agents/inject.js'
|
|
269
|
-
import { handleInjectCommand } from './inject-handler.js'
|
|
268
|
+
import { handleInjectCommand, type InjectDeps } from './inject-handler.js'
|
|
270
269
|
import {
|
|
271
270
|
parseModelCommand,
|
|
272
271
|
handleModelCommand,
|
|
@@ -293,7 +292,9 @@ import { refreshBanner } from '../slot-banner-driver.js'
|
|
|
293
292
|
import { loadConfig as loadSwitchroomConfig, findConfigFile as findSwitchroomConfigFile } from '../../src/config/loader.js'; import { resolveAgentConfig } from '../../src/config/merge.js'
|
|
294
293
|
import { resolveOutboundTopic as resolveOutboundTopicHelper, topicForRecipient, type TopicRouterConfig as _OutboundRouterConfig } from '../../src/telegram/topic-router.js'
|
|
295
294
|
import { readTurnUsages } from '../../src/agents/perf.js'
|
|
295
|
+
import { buildContextOccupancy, writeContextOccupancySnapshot } from './context-occupancy.js'
|
|
296
296
|
import { decideProactiveCompact, initialCompactState, type CompactState } from './proactive-compact.js'
|
|
297
|
+
import { decideIdleClear, idleDurationToMs, DEFAULT_IDLE_CLEAR_MS } from './idle-clear.js'
|
|
297
298
|
import { nextCompactNotify, idleCompactNotifyState, type CompactNotifyState } from './compact-notify.js'
|
|
298
299
|
import {
|
|
299
300
|
tryHostdDispatch,
|
|
@@ -369,6 +370,7 @@ import {
|
|
|
369
370
|
foregroundFinishAction,
|
|
370
371
|
} from './foreground-nesting.js'
|
|
371
372
|
import { createPollHealthCheck, type PollHealthCheckHandle } from './poll-health.js'
|
|
373
|
+
import { recoverFromPollStall } from './poll-stall-recovery.js'
|
|
372
374
|
import type {
|
|
373
375
|
ToolCallMessage,
|
|
374
376
|
ToolCallResult,
|
|
@@ -1351,6 +1353,11 @@ function checkApprovals(): void {
|
|
|
1351
1353
|
}
|
|
1352
1354
|
}
|
|
1353
1355
|
if (!STATIC) setInterval(checkApprovals, 5000).unref()
|
|
1356
|
+
// Idle auto-clear: check wall-clock idle every minute; maybeIdleClear no-ops
|
|
1357
|
+
// when disabled ('0s'), mid-turn, or already cleared this idle period. The
|
|
1358
|
+
// `let` state + maybeIdleClear are hoisted/initialized before this fires.
|
|
1359
|
+
const IDLE_CLEAR_CHECK_MS = Number(process.env.SWITCHROOM_IDLE_CLEAR_CHECK_MS ?? 60_000)
|
|
1360
|
+
if (!STATIC && IDLE_CLEAR_CHECK_MS > 0) setInterval(maybeIdleClear, IDLE_CLEAR_CHECK_MS).unref()
|
|
1354
1361
|
|
|
1355
1362
|
// ─── Thread / status / stream state ───────────────────────────────────────
|
|
1356
1363
|
const chatThreadMap = new Map<string, number>()
|
|
@@ -2796,6 +2803,45 @@ function purgeReactionTracking(key: string, endingTurn?: CurrentTurn): void {
|
|
|
2796
2803
|
// moot, so only evaluate when no restart drained this pass.
|
|
2797
2804
|
maybeProactiveCompact();
|
|
2798
2805
|
}
|
|
2806
|
+
// Context-headroom snapshot (RFC context-headroom-surface) — write the
|
|
2807
|
+
// current occupancy + cap so `switchroom status`/`doctor`/web can show
|
|
2808
|
+
// headroom. Independent of proactive-compaction (writes even when no cap
|
|
2809
|
+
// is configured) and best-effort (never throws). Runs on the same idle
|
|
2810
|
+
// signal — never mid-turn.
|
|
2811
|
+
snapshotContextOccupancy();
|
|
2812
|
+
}
|
|
2813
|
+
}
|
|
2814
|
+
|
|
2815
|
+
/**
 * Refresh the per-agent context-occupancy snapshot file.
 *
 * Occupancy is the sum of `input + cacheRead + cacheCreate` from the single
 * turn-usage record returned by `readTurnUsages(file, 1)`. The cap comes from
 * the agent's resolved `session.max_context_tokens`; when it is unset, or the
 * config cannot be loaded/resolved, the snapshot is written without a cap.
 *
 * Does nothing when SWITCHROOM_AGENT_NAME is unset, when no active session
 * file is known, or when there are no usage records. Best-effort: every
 * failure is swallowed, so this never throws into the idle gate.
 */
function snapshotContextOccupancy(): void {
  try {
    const agentName = process.env.SWITCHROOM_AGENT_NAME;
    const sessionFile = lastSessionActiveFile;
    if (!agentName || !sessionFile) return;

    const usages = readTurnUsages(sessionFile, 1);
    if (usages.length === 0) return;
    const latest = usages[0];
    const occupancy = latest.input + latest.cacheRead + latest.cacheCreate;

    // Cap lookup is isolated so an unreadable config degrades to "no cap"
    // (occupancy shown without a ratio) instead of skipping the snapshot.
    const resolveCap = (): number | null => {
      try {
        const config = loadSwitchroomConfig();
        const agentEntry = config.agents?.[agentName] ?? {};
        const merged = resolveAgentConfig(config.defaults, config.profiles, agentEntry);
        return merged.session?.max_context_tokens ?? null;
      } catch {
        return null;
      }
    };
    const cap = resolveCap();

    const stateDir = process.env.SWITCHROOM_AGENT_STATE_DIR ?? "/state/agent";
    const snapshot = buildContextOccupancy(occupancy, cap, Date.now());
    writeContextOccupancySnapshot(stateDir, snapshot);
  } catch {
    /* best-effort — never disrupt the idle gate */
  }
}
|
|
2801
2847
|
|
|
@@ -3057,6 +3103,111 @@ function maybeProactiveCompact(): void {
|
|
|
3057
3103
|
});
|
|
3058
3104
|
}
|
|
3059
3105
|
|
|
3106
|
+
// ─── Idle auto-clear ──────────────────────────────────────────────────────
// A wall-clock idle period triggers /clear (decision logic: idle-clear.ts).
// This is separate from proactive-compact, which is occupancy-driven and runs
// at the turn-end gate: an agent that is completely idle never ends a turn, so
// idle auto-clear is evaluated on its own interval instead. Activity of any
// kind (inbound / turn start / cron fire) restarts the timer through
// markIdleActivity(). It fires once per idle period and never mid-turn
// (turnInFlightForGate — the same gate compaction uses).

/** Epoch ms of the most recent activity; starts at module load. */
let lastIdleActivityAt = Date.now();
/** True once this idle period has already been auto-cleared. */
let idleAutoCleared = false;
/** True while a /clear injection is still in flight. */
let idleClearDispatching = false;

/**
 * Record activity: re-arm auto-clear and restart the idle timer from now.
 * Call on ANY activity.
 */
function markIdleActivity(): void {
  idleAutoCleared = false;
  lastIdleActivityAt = Date.now();
}
|
|
3121
|
+
|
|
3122
|
+
/**
 * Resolve the idle window in ms: env override → per-agent config → 3h default.
 * A result of 0 disables auto-clear.
 *
 * Env override (`SWITCHROOM_IDLE_CLEAR_MS`): surrounding whitespace is
 * ignored, and an empty or whitespace-only value counts as "not set" — without
 * the trim, `Number(' ')` is 0 and a blank value would silently disable
 * auto-clear. A set value that is not a finite number ≥ 0 yields the default.
 *
 * Config (`session.idle_clear_after` on the resolved agent config): unset,
 * malformed, or unreadable config all yield the default, so the feature stays
 * on unless explicitly configured otherwise.
 *
 * @returns The idle window in milliseconds.
 */
function resolveIdleClearMs(): number {
  const env = process.env.SWITCHROOM_IDLE_CLEAR_MS?.trim();
  if (env != null && env !== '') {
    const n = Number(env);
    return Number.isFinite(n) && n >= 0 ? n : DEFAULT_IDLE_CLEAR_MS;
  }
  try {
    const agentName = process.env.SWITCHROOM_AGENT_NAME;
    if (!agentName) return DEFAULT_IDLE_CLEAR_MS;
    const cfg = loadSwitchroomConfig();
    const rawAgent = cfg.agents?.[agentName] ?? {};
    const resolved = resolveAgentConfig(cfg.defaults, cfg.profiles, rawAgent);
    const raw = resolved.session?.idle_clear_after;
    if (raw == null) return DEFAULT_IDLE_CLEAR_MS; // unset → on by default (3h)
    const ms = idleDurationToMs(raw);
    return ms == null ? DEFAULT_IDLE_CLEAR_MS : ms;
  } catch {
    return DEFAULT_IDLE_CLEAR_MS; // config unreadable → keep the default on
  }
}
|
|
3143
|
+
|
|
3144
|
+
/**
 * Evaluate idle auto-clear (runs on the IDLE_CLEAR_CHECK_MS interval).
 *
 * No-ops while a previous dispatch is still in flight, when
 * SWITCHROOM_AGENT_NAME is unset, or when decideIdleClear() says not to clear.
 * Otherwise it latches `idleAutoCleared` / `idleClearDispatching`, logs one
 * line to stderr, injects `/clear`, and posts the operator notice once the
 * injection succeeds. An injection failure is logged, not thrown.
 *
 * The dispatch runs inside an async try/catch/finally so that a synchronous
 * throw from the inject call is handled the same way as a rejected promise:
 * it is logged and `idleClearDispatching` is always released, rather than
 * escaping the interval callback with the latch stuck on.
 */
function maybeIdleClear(): void {
  if (idleClearDispatching) return;
  const agentName = process.env.SWITCHROOM_AGENT_NAME;
  if (!agentName) return;
  const idleClearMs = resolveIdleClearMs();
  const decision = decideIdleClear(
    {
      lastActivityAt: lastIdleActivityAt,
      idleClearMs,
      alreadyCleared: idleAutoCleared,
      turnInFlight: turnInFlightForGate(),
    },
    Date.now(),
  );
  if (!decision.clear) return;
  // Fire once per idle period — set BEFORE the await so the next tick can't
  // double-dispatch. markIdleActivity() re-arms on the next real activity.
  idleAutoCleared = true;
  idleClearDispatching = true;
  process.stderr.write(
    `telegram gateway: idle auto-/clear for ${agentName} ` +
      `(idle >= ${Math.round(idleClearMs / 60_000)}m)\n`,
  );
  // Accepted check-to-send race (same as maybeProactiveCompact): a new inbound
  // could arrive between the gate check and the tmux send; /clear then lands in
  // claude's prompt buffer and runs at the next idle prompt (inject.ts FUTURE-GAP).
  void (async () => {
    try {
      await injectSlashCommandImpl(agentName, '/clear');
      // Notice is fire-and-forget; it handles its own failures.
      void postIdleClearNotice(idleClearMs);
    } catch (err: unknown) {
      process.stderr.write(
        `telegram gateway: idle /clear inject failed for ` +
          `${agentName}: ${err instanceof Error ? err.message : String(err)}\n`,
      );
    } finally {
      idleClearDispatching = false;
    }
  })();
}
|
|
3181
|
+
|
|
3182
|
+
/**
 * Send a subtle one-line Telegram notice telling the operator the session was
 * auto-cleared.
 *
 * The recipient is the first `allowFrom` entry from loadAccess(); nothing is
 * sent when there is none. The thread is resolved via topicForRecipient(),
 * preferring the agent's outbound topic for `compact-watchdog` and falling
 * back to the chat's entry in `chatThreadMap`. Best-effort: any failure is
 * swallowed, because the /clear itself has already happened.
 *
 * @param idleClearMs Idle window in ms; shown in hours, rounded to one decimal.
 */
async function postIdleClearNotice(idleClearMs: number): Promise<void> {
  try {
    const chatId = loadAccess().allowFrom[0];
    if (!chatId) return;

    const resolvedTopic =
      resolveAgentOutboundTopic({ kind: 'compact-watchdog' }) ?? chatThreadMap.get(chatId);
    const supergroupChatId = resolveAgentSupergroupChatId();
    const threadId = topicForRecipient({
      recipientChatId: chatId,
      resolvedTopic,
      supergroupChatId,
    });

    const tenthsOfHours = Math.round((idleClearMs / 3_600_000) * 10);
    const hrs = tenthsOfHours / 10;
    const text =
      `🧹 <b>Cleared after ${hrs}h idle</b> — fresh slate next message; ` +
      `long-term memory is in Hindsight.`;

    // Only attach message_thread_id when a thread was actually resolved.
    const threadOpts = threadId != null ? { message_thread_id: threadId } : {};
    await swallowingApiCall(
      () => bot.api.sendMessage(chatId, text, { parse_mode: 'HTML', ...threadOpts }),
      { chat_id: chatId, verb: 'idleAutoClear.notice' },
    );
  } catch {
    /* best-effort notice — the /clear itself still happened */
  }
}
|
|
3210
|
+
|
|
3060
3211
|
/**
|
|
3061
3212
|
* Post the START card for a proactive compaction. Best-effort: a failed
|
|
3062
3213
|
* send just means no card (the compaction itself still happens). The
|
|
@@ -6690,6 +6841,10 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
6690
6841
|
},
|
|
6691
6842
|
|
|
6692
6843
|
onInjectInbound(_client: IpcClient, msg: InjectInboundMessage) {
|
|
6844
|
+
// Cron fires (incl. cheap-cron, whose session events are dropped before
|
|
6845
|
+
// currentTurn is set) are real activity — re-arm idle auto-clear so a
|
|
6846
|
+
// working scheduled agent isn't wiped after 3h of "no inbound".
|
|
6847
|
+
markIdleActivity()
|
|
6693
6848
|
const promptKey = typeof msg.inbound.meta?.prompt_key === 'string'
|
|
6694
6849
|
? msg.inbound.meta.prompt_key
|
|
6695
6850
|
: 'unknown'
|
|
@@ -9994,6 +10149,7 @@ function handleSessionEvent(ev: SessionEvent): void {
|
|
|
9994
10149
|
isDm: isDmChatId(ev.chatId),
|
|
9995
10150
|
}
|
|
9996
10151
|
currentTurn = next
|
|
10152
|
+
markIdleActivity() // any turn start (main session) is activity — re-arm idle clear
|
|
9997
10153
|
// Status-surface observability: one line at every turn SET so a later
|
|
9998
10154
|
// dark card is traceable to which turn/topic key it belonged to.
|
|
9999
10155
|
process.stderr.write(
|
|
@@ -11672,6 +11828,7 @@ async function handleInbound(
|
|
|
11672
11828
|
// (image_path_2, attachment_file_id_2, …) alongside the primary.
|
|
11673
11829
|
extraAttachments?: CoalesceAttachment[],
|
|
11674
11830
|
): Promise<void> {
|
|
11831
|
+
markIdleActivity() // any inbound resets the idle auto-clear timer + re-arms
|
|
11675
11832
|
const isTopicMessage = ctx.message?.is_topic_message ?? false
|
|
11676
11833
|
const messageThreadId = ctx.message?.message_thread_id
|
|
11677
11834
|
|
|
@@ -14452,19 +14609,37 @@ bot.command('agents', async ctx => {
|
|
|
14452
14609
|
|
|
14453
14610
|
// /inject — #725 Phase 2 slash-command bridge. Implementation in
|
|
14454
14611
|
// inject-handler.ts so it's unit-testable without booting the bot.
|
|
14455
|
-
|
|
14456
|
-
|
|
14457
|
-
|
|
14612
|
+
// Shared deps for the inject-backed commands. `/inject <verb>` uses the
|
|
14613
|
+
// defaults; first-class /compact and /clear pass `open` (anyone in the chat —
|
|
14614
|
+
// operator decision, single-tenant trust) + a `fixedVerb` so they don't need
|
|
14615
|
+
// the `/inject` prefix.
|
|
14616
|
+
function buildInjectDeps(opts?: { open?: boolean; fixedVerb?: string }): InjectDeps {
|
|
14617
|
+
return {
|
|
14618
|
+
isAuthorized: opts?.open ? () => true : isAuthorizedSender,
|
|
14458
14619
|
inject: injectSlashCommandImpl,
|
|
14459
14620
|
// accent is already inlined into the body by the handler via
|
|
14460
14621
|
// buildAccentHeader; switchroomReply doesn't need to know about it.
|
|
14461
|
-
reply: async (ctx, text,
|
|
14622
|
+
reply: async (ctx, text, replyOpts) => switchroomReply(ctx, text, { html: replyOpts?.html }),
|
|
14462
14623
|
getAgentName: getMyAgentName,
|
|
14463
|
-
getArgs: getCommandArgs,
|
|
14624
|
+
getArgs: opts?.fixedVerb != null ? () => opts.fixedVerb as string : getCommandArgs,
|
|
14464
14625
|
escapeHtml: escapeHtmlForTg,
|
|
14465
14626
|
preBlock,
|
|
14466
14627
|
formatOutput: formatSwitchroomOutput,
|
|
14467
|
-
}
|
|
14628
|
+
}
|
|
14629
|
+
}
|
|
14630
|
+
|
|
14631
|
+
bot.command('inject', async ctx => {
|
|
14632
|
+
await handleInjectCommand(ctx, buildInjectDeps())
|
|
14633
|
+
})
|
|
14634
|
+
|
|
14635
|
+
// /compact + /clear — first-class session-control commands, open to anyone in
|
|
14636
|
+
// the chat. Both are in the INJECT_COMMANDS allowlist; they ride the same
|
|
14637
|
+
// inject primitive as `/inject compact` / `/inject clear`.
|
|
14638
|
+
bot.command('compact', async ctx => {
|
|
14639
|
+
await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/compact' }))
|
|
14640
|
+
})
|
|
14641
|
+
bot.command('clear', async ctx => {
|
|
14642
|
+
await handleInjectCommand(ctx, buildInjectDeps({ open: true, fixedVerb: '/clear' }))
|
|
14468
14643
|
})
|
|
14469
14644
|
|
|
14470
14645
|
// /model — model dashboard + switch for this agent's live session.
|
|
@@ -14705,7 +14880,8 @@ function flushAgentHandoff(agentDir: string): number {
|
|
|
14705
14880
|
return removed
|
|
14706
14881
|
}
|
|
14707
14882
|
|
|
14708
|
-
async function
|
|
14883
|
+
async function handleNewCommand(ctx: Context): Promise<void> {
|
|
14884
|
+
const kind = 'new' // /reset removed (was a pure alias); keep the string for messages
|
|
14709
14885
|
if (!isAuthorizedSender(ctx)) return
|
|
14710
14886
|
const name = (typeof ctx.match === "string" ? ctx.match : "").trim() || getMyAgentName()
|
|
14711
14887
|
try { assertSafeAgentName(name) } catch { await switchroomReply(ctx, 'Invalid agent name.'); return }
|
|
@@ -14755,9 +14931,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
|
|
|
14755
14931
|
|
|
14756
14932
|
const chatId = String(ctx.chat!.id)
|
|
14757
14933
|
const threadId = resolveThreadId(chatId, ctx.message?.message_thread_id)
|
|
14758
|
-
const ackText =
|
|
14759
|
-
? buildNewSessionAckText(name, flushed > 0)
|
|
14760
|
-
: buildResetSessionAckText(name, flushed > 0)
|
|
14934
|
+
const ackText = buildNewSessionAckText(name, flushed > 0)
|
|
14761
14935
|
let ackId: number | null = null
|
|
14762
14936
|
// #1075: thread-id-bearing — fall back to main chat.
|
|
14763
14937
|
try {
|
|
@@ -14823,8 +14997,7 @@ async function handleNewOrResetCommand(ctx: Context, kind: 'new' | 'reset'): Pro
|
|
|
14823
14997
|
)
|
|
14824
14998
|
}
|
|
14825
14999
|
|
|
14826
|
-
bot.command('new', async ctx =>
|
|
14827
|
-
bot.command('reset', async ctx => handleNewOrResetCommand(ctx, 'reset'))
|
|
15000
|
+
bot.command('new', async ctx => handleNewCommand(ctx))
|
|
14828
15001
|
|
|
14829
15002
|
// /update — host update from Telegram (#919). Default = dry-run plan
|
|
14830
15003
|
// (`switchroom update --check`); explicit `apply` triggers the real
|
|
@@ -21118,54 +21291,32 @@ process.on('uncaughtException', err => {
|
|
|
21118
21291
|
let runnerHandle: RunnerHandle | null = null
|
|
21119
21292
|
|
|
21120
21293
|
// Long-poll health-check handle (issue #56). Created once per process, started
|
|
21121
|
-
// after the runner comes up, stopped on clean shutdown.
|
|
21122
|
-
//
|
|
21294
|
+
// after the runner comes up, stopped on clean shutdown. On a confirmed stall
|
|
21295
|
+
// the gateway EXITS non-zero and the supervisor restarts it with a fresh runner
|
|
21296
|
+
// (see recoverFromPollStall + the 2026-06-18 incident note there). It does NOT
|
|
21297
|
+
// try to stop()+re-run the runner in place — grammy's stop() blocks on a non-
|
|
21298
|
+
// abortable getUpdates retry backoff during a network outage, which hung the
|
|
21299
|
+
// whole fleet deaf.
|
|
21123
21300
|
//
|
|
21124
21301
|
// Interval and threshold are configurable via env for ops/testing flexibility:
|
|
21125
|
-
// SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default
|
|
21126
|
-
// SWITCHROOM_POLL_HEALTH_THRESHOLD — default 3
|
|
21302
|
+
// SWITCHROOM_POLL_HEALTH_INTERVAL_MS — default 60s (fast self-heal after a flap)
|
|
21303
|
+
// SWITCHROOM_POLL_HEALTH_THRESHOLD — default 3 (a single blip must not trip it)
|
|
21127
21304
|
const POLL_HEALTH_INTERVAL_MS = Number(
|
|
21128
|
-
process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ??
|
|
21305
|
+
process.env.SWITCHROOM_POLL_HEALTH_INTERVAL_MS ?? 60_000,
|
|
21129
21306
|
)
|
|
21130
21307
|
const POLL_HEALTH_THRESHOLD = Number(
|
|
21131
21308
|
process.env.SWITCHROOM_POLL_HEALTH_THRESHOLD ?? 3,
|
|
21132
21309
|
)
|
|
21133
21310
|
|
|
21134
|
-
/** Sentinel error thrown by onStall so the outer for-loop retries rather
|
|
21135
|
-
* than exiting. The catch block recognises this specific message. */
|
|
21136
|
-
class PollStallError extends Error {
|
|
21137
|
-
constructor() {
|
|
21138
|
-
super('poll_stall_restart')
|
|
21139
|
-
this.name = 'PollStallError'
|
|
21140
|
-
}
|
|
21141
|
-
}
|
|
21142
|
-
|
|
21143
21311
|
let pollHealthCheck: PollHealthCheckHandle | null = null
|
|
21144
21312
|
if (POLL_HEALTH_INTERVAL_MS > 0) {
|
|
21145
21313
|
pollHealthCheck = createPollHealthCheck({
|
|
21146
21314
|
ping: () => bot.api.getMe(),
|
|
21147
21315
|
onStall: async () => {
|
|
21148
|
-
|
|
21149
|
-
|
|
21150
|
-
|
|
21151
|
-
)
|
|
21152
|
-
if (runnerHandle != null && runnerHandle.isRunning()) {
|
|
21153
|
-
try {
|
|
21154
|
-
await runnerHandle.stop()
|
|
21155
|
-
} catch (err) {
|
|
21156
|
-
process.stderr.write(
|
|
21157
|
-
`telegram gateway: poll.health_check.stall_recovery runner.stop error: ${(err as Error).message}\n`,
|
|
21158
|
-
)
|
|
21159
|
-
}
|
|
21160
|
-
}
|
|
21161
|
-
// runnerHandle.stop() causes task() to resolve. That would normally
|
|
21162
|
-
// hit the `return` below and exit the startup IIFE. Instead we throw
|
|
21163
|
-
// PollStallError from inside task()'s continuation by surfacing it
|
|
21164
|
-
// through the outer catch block — but task() itself doesn't throw here.
|
|
21165
|
-
//
|
|
21166
|
-
// The simpler fix: set runnerHandle to a sentinel that the code below
|
|
21167
|
-
// `await runnerHandle.task()` checks to decide continue vs return.
|
|
21168
|
-
runnerHandle = null
|
|
21316
|
+
// Exit non-zero → _switchroom_supervise restarts the gateway sidecar
|
|
21317
|
+
// with a fresh runner. Never awaits runnerHandle.stop() (it hangs on a
|
|
21318
|
+
// wedged source). recoverFromPollStall exits 1, never 78.
|
|
21319
|
+
recoverFromPollStall({ agentName: process.env.SWITCHROOM_AGENT_NAME ?? '-' })
|
|
21169
21320
|
},
|
|
21170
21321
|
intervalMs: POLL_HEALTH_INTERVAL_MS,
|
|
21171
21322
|
failureThreshold: POLL_HEALTH_THRESHOLD,
|
|
@@ -22247,20 +22398,10 @@ void (async () => {
|
|
|
22247
22398
|
pollHealthCheck?.stop()
|
|
22248
22399
|
pollHealthCheck?.start()
|
|
22249
22400
|
await runnerHandle.task()
|
|
22250
|
-
//
|
|
22251
|
-
//
|
|
22252
|
-
//
|
|
22253
|
-
//
|
|
22254
|
-
// distinguish: null means stall-triggered, non-null means clean exit.
|
|
22255
|
-
if (runnerHandle === null) {
|
|
22256
|
-
const agentName = process.env.SWITCHROOM_AGENT_NAME ?? '-'
|
|
22257
|
-
process.stderr.write(
|
|
22258
|
-
`telegram gateway: poll.health_check.stall_recovery restarting runner agent=${agentName}\n`,
|
|
22259
|
-
)
|
|
22260
|
-
// Brief pause so the Telegram API can close the stalled connection.
|
|
22261
|
-
await new Promise(r => setTimeout(r, 2000))
|
|
22262
|
-
continue
|
|
22263
|
-
}
|
|
22401
|
+
// task() resolves only on clean shutdown (shutdown-drain stops the
|
|
22402
|
+
// runner) — exit the startup IIFE. Stall recovery no longer routes here:
|
|
22403
|
+
// onStall exits the process and the supervisor restarts the gateway
|
|
22404
|
+
// (see recoverFromPollStall). The 409 path below re-runs in place.
|
|
22264
22405
|
return
|
|
22265
22406
|
} catch (err) {
|
|
22266
22407
|
if (err instanceof GrammyError && err.error_code === 409) {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Hostd dispatch helpers for the gateway's self-restart slash-commands
|
|
3
3
|
* (#1175 RFC C, Phase 2). When the operator has opted into
|
|
4
|
-
* `host_control.enabled: true`, /restart, /new,
|
|
4
|
+
* `host_control.enabled: true`, /restart, /new, and
|
|
5
5
|
* /update apply route through the per-agent hostd UDS instead of the
|
|
6
6
|
* in-container `spawnSwitchroomDetached` shellout.
|
|
7
7
|
*
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Idle auto-clear: wipe a session's working context after a wall-clock idle
|
|
3
|
+
* period (default 3h), so a long-untouched agent starts fresh next message
|
|
4
|
+
* instead of resuming a stale, context-heavy thread. Long-term memory lives in
|
|
5
|
+
* Hindsight, so a clear loses only the in-session scratch.
|
|
6
|
+
*
|
|
7
|
+
* Sibling of proactive-compact.ts (occupancy-driven /compact at the turn-end
|
|
8
|
+
* idle gate). This one is wall-clock-driven: it fires `/clear` from a periodic
|
|
9
|
+
* interval because a fully-idle agent never ends a turn, so the turn-end gate
|
|
10
|
+
* alone would never see it. Both inject via the same primitive and both refuse
|
|
11
|
+
* to fire mid-turn (turnInFlight guard).
|
|
12
|
+
*
|
|
13
|
+
* The decider is pure so the fire-once / re-arm / not-mid-turn / disabled logic
|
|
14
|
+
* is unit-tested without the gateway.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/** Default idle window when `session.idle_clear_after` is unset (3h). ON by default. */
export const DEFAULT_IDLE_CLEAR_MS = 3 * 60 * 60 * 1000;

/** Inputs to one idle auto-clear evaluation. */
export interface IdleClearState {
  /** Epoch ms of the last activity (inbound, turn start, cron fire). */
  lastActivityAt: number;
  /** Idle window in ms. <= 0 (or NaN) disables auto-clear. */
  idleClearMs: number;
  /** Already auto-cleared since the last activity? Prevents re-clearing every tick. */
  alreadyCleared: boolean;
  /** A turn is in flight — never clear mid-turn. */
  turnInFlight: boolean;
}

/** Result of one idle auto-clear evaluation. */
export interface IdleClearDecision {
  /** True when the caller should dispatch `/clear` now. */
  clear: boolean;
}

/**
 * Decide whether to auto-clear this evaluation. Fires exactly once per idle
 * period: only when enabled, not mid-turn, not already cleared, and the idle
 * window has elapsed. The caller sets `alreadyCleared` on fire and resets it
 * (with `lastActivityAt`) on the next activity to re-arm.
 *
 * Fails safe on bad numbers: clearing is destructive, so a NaN window,
 * timestamp or `now` results in `clear: false` rather than falling through
 * every `<` comparison into a clear.
 *
 * @param state Current idle-tracking state.
 * @param now Current epoch ms.
 * @returns `{ clear: true }` only when every condition above holds.
 */
export function decideIdleClear(
  state: IdleClearState,
  now: number,
): IdleClearDecision {
  // `!(x > 0)` instead of `x <= 0` so that NaN also counts as disabled.
  if (!(state.idleClearMs > 0)) return { clear: false }; // disabled
  if (state.turnInFlight) return { clear: false }; // never mid-turn
  if (state.alreadyCleared) return { clear: false }; // once per idle period
  // Positive test: a NaN elapsed time compares false and therefore never clears.
  const idleFor = now - state.lastActivityAt;
  return { clear: idleFor >= state.idleClearMs };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Convert a duration string of the form `^\d+[smh]$` (the SessionSchema
 * format — "3h", "30m", "7200s") into milliseconds. Leading and trailing
 * whitespace is ignored.
 *
 * Kept local (vs importing the web module's parser) to avoid cross-package
 * coupling.
 *
 * @param raw Duration string to parse.
 * @returns The duration in ms, or null when the string is malformed so the
 *   caller can fall back to the default.
 */
export function idleDurationToMs(raw: string): number | null {
  const match = /^(\d+)([smh])$/.exec(raw.trim());
  if (match === null) return null;

  const amount = Number(match[1]);
  const unit = match[2];
  if (unit === "s") return amount * 1000;
  if (unit === "m") return amount * 60_000;
  if (unit === "h") return amount * 3_600_000;
  // The pattern admits only s/m/h, so this is a formality.
  return null;
}
|
|
@@ -10,18 +10,23 @@
|
|
|
10
10
|
*
|
|
11
11
|
* Fix:
|
|
12
12
|
* A separate setInterval calls `getMe()` (a lightweight Bot API
|
|
13
|
-
* endpoint) every
|
|
14
|
-
* constitute a stall
|
|
15
|
-
* for the in-flight request to die, then let the caller restart it.
|
|
13
|
+
* endpoint) every `intervalMs`. `failureThreshold` consecutive
|
|
14
|
+
* failures constitute a stall and fire `onStall`.
|
|
16
15
|
*
|
|
17
16
|
* A single failure doesn't count — transient network blips happen.
|
|
18
17
|
* The threshold must be >= 3 so a brief Telegram outage (e.g. a
|
|
19
18
|
* data-centre hiccup) doesn't cause thrashing.
|
|
20
19
|
*
|
|
20
|
+
* Recovery (see gateway.ts onStall → poll-stall-recovery.ts):
|
|
21
|
+
* `onStall` does NOT stop()+re-run the runner — grammy's stop() blocks
|
|
22
|
+
* on a non-abortable getUpdates retry backoff during an outage, which hung
|
|
23
|
+
* the fleet deaf (2026-06-18). It exits the process non-zero and the
|
|
24
|
+
* supervisor restarts the gateway with a fresh runner.
|
|
25
|
+
*
|
|
21
26
|
* Usage:
|
|
22
27
|
* const hc = createPollHealthCheck({
|
|
23
28
|
* ping: () => bot.api.getMe(),
|
|
24
|
-
* onStall: async () => {
|
|
29
|
+
* onStall: async () => { recoverFromPollStall({ agentName }); },
|
|
25
30
|
* log: (msg) => process.stderr.write(msg),
|
|
26
31
|
* });
|
|
27
32
|
* // start after the runner is up:
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Telegram poll stall recovery (incident 2026-06-18: a network flap left the
|
|
3
|
+
* whole agent fleet alive-but-deaf for ~30 min until a manual restart).
|
|
4
|
+
*
|
|
5
|
+
* When the long-poll health-check (poll-health.ts) detects a stall — 3
|
|
6
|
+
* consecutive `getMe` failures — the OLD recovery did `await runnerHandle.stop()`
|
|
7
|
+
* then re-ran the runner in place. That HUNG: grammy's `stop()` returns the
|
|
8
|
+
* runner task promise, which is blocked on a non-abortable `getUpdates` retry
|
|
9
|
+
* backoff (`@grammyjs/runner` maxRetryTime 15h, exponential, plain setTimeout
|
|
10
|
+
* that ignores the abort signal). During a network outage the source never
|
|
11
|
+
* terminates, so `stop()` never resolves, the re-run never fires, and the
|
|
12
|
+
* gateway stays deaf.
|
|
13
|
+
*
|
|
14
|
+
* Recovery is now a clean non-zero process EXIT: `_switchroom_supervise`
|
|
15
|
+
* (profiles/_base/start.sh.hbs) restarts the gateway sidecar with a fresh
|
|
16
|
+
* runner. Same mechanism the manual fix used and the run loop's catch-block
|
|
17
|
+
* already relies on; it restarts only the gateway (claude/tmux untouched,
|
|
18
|
+
* in-flight turns recovered by boot-resume). Extracted as its own module with
|
|
19
|
+
* an injectable `exit` so the exit code is unit-testable (mirrors
|
|
20
|
+
* startup-network-retry.ts) — gateway.ts itself can't be imported in a test.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
/**
 * Optional collaborators for {@link recoverFromPollStall}. Every field has a
 * default, so production callers may pass only `agentName` (or nothing).
 */
export interface PollStallRecoveryDeps {
  /** Process exit. Injectable for tests; defaults to process.exit. */
  exit?: (code: number) => void;
  /** Logger. Defaults to process.stderr. */
  log?: (msg: string) => void;
  /** Agent name for the log line. Defaults to "-". */
  agentName?: string;
}

/** Exit code for a poll stall. Always 1 — NEVER 78. */
export const POLL_STALL_EXIT_CODE = 1;

/**
 * Recover from a confirmed Telegram poll stall by exiting non-zero so the
 * supervisor restarts the gateway with a fresh runner.
 *
 * MUST exit 1 — NEVER 78. Exit 78 (EX_CONFIG) is the supervisor's permanent-
 * quarantine sentinel (start.sh.hbs); quarantining on a *transient* network
 * stall would leave the gateway dead even after connectivity returns — the
 * exact failure this fix exists to prevent.
 *
 * Logging is best-effort: a logger that throws must not prevent the exit,
 * otherwise the process would stay alive and deaf.
 *
 * @param deps Optional overrides for `exit`, `log`, and `agentName`.
 */
export function recoverFromPollStall(deps: PollStallRecoveryDeps = {}): void {
  const exit = deps.exit ?? ((code: number) => process.exit(code));
  const log =
    deps.log ??
    ((msg: string) => {
      // Guarantee exactly one trailing newline on the default stderr sink.
      process.stderr.write(msg.endsWith("\n") ? msg : msg + "\n");
    });
  const agentName = deps.agentName ?? "-";

  try {
    log(
      `telegram gateway: poll.health_check.stall_recovery exiting code=${POLL_STALL_EXIT_CODE} ` +
        `pid=${process.pid} agent=${agentName} — supervisor will restart the gateway with a fresh runner ` +
        `(not awaiting runnerHandle.stop(): grammy stop() blocks on a non-abortable getUpdates retry backoff during an outage)`,
    );
  } catch {
    // Deliberately ignored: the log line is diagnostic only. The exit below
    // is the actual recovery and must run even if the log sink is broken.
  }

  exit(POLL_STALL_EXIT_CODE);
}
|