switchroom 0.12.13 → 0.12.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +107 -371
- package/dist/cli/switchroom.js +627 -386
- package/dist/vault/approvals/kernel-server.js +88 -1
- package/dist/vault/broker/server.js +132 -2
- package/package.json +1 -1
- package/telegram-plugin/dist/gateway/gateway.js +289 -81
- package/telegram-plugin/gateway/approval-callback.test.ts +49 -1
- package/telegram-plugin/gateway/approval-callback.ts +85 -56
- package/telegram-plugin/gateway/gateway.ts +168 -19
- package/telegram-plugin/gateway/pending-inbound-buffer.ts +39 -0
- package/telegram-plugin/gateway/pending-permission-decisions.ts +112 -0
- package/telegram-plugin/gateway/vault-grant-inbound-builders.ts +117 -0
- package/telegram-plugin/tests/pending-inbound-buffer.test.ts +71 -1
- package/telegram-plugin/tests/pending-permission-decisions.test.ts +73 -0
- package/telegram-plugin/tests/vault-save-inbound-builders.test.ts +96 -0
|
@@ -22,14 +22,58 @@
|
|
|
22
22
|
*/
|
|
23
23
|
|
|
24
24
|
import { type Context, InlineKeyboard } from "grammy";
|
|
25
|
-
import { parseApprovalCallback, ttlMsFromToken } from "./approval-card.js";
|
|
26
25
|
import {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
26
|
+
parseApprovalCallback,
|
|
27
|
+
ttlMsFromToken,
|
|
28
|
+
type ApprovalChoice,
|
|
29
|
+
} from "./approval-card.js";
|
|
30
|
+
import { approvalConsumeRecord } from "../../src/vault/approvals/client.js";
|
|
30
31
|
import type { ApprovalDecisionMode } from "../../src/vault/approvals/schema.js";
|
|
31
32
|
import { scopeToOpenInDriveButton } from "../../src/drive/deep-links.js";
|
|
32
33
|
|
|
34
|
+
/**
 * Result of mapping a tapped approval button to a decision.
 *
 * - `ok: true` carries the decision tuple the handler forwards to the
 *   approval kernel (`decision`, `ttl_ms`) together with the values it
 *   uses for the post-tap card (`granted`, `displayMode`).
 * - `ok: false` carries a short `error` string that the handler shows
 *   to the operator via `answerCallbackQuery`.
 */
export type ResolvedApprovalDecision =
  | {
      ok: true;
      decision: ApprovalDecisionMode;
      granted: boolean;
      ttl_ms: number | null;
      displayMode: string;
    }
  | { ok: false; error: string };

/**
 * Resolve a tapped approval choice to its decision tuple — PURE, no
 * kernel I/O, so the failure branches are unit-testable without
 * mocking grammy.
 *
 * Extracted (PR-5) from `handleApprovalCallback` so PR-4's invariant —
 * "compute + validate the decision BEFORE burning the single-use
 * nonce" — is structural, not a comment: the handler calls this first
 * and only proceeds to the kernel on `ok: true`. On `{ ok: false }`
 * the nonce is never touched, so the operator can re-tap a valid
 * choice.
 *
 * @param choice - The parsed button choice (`deny`, `once`, `always`,
 *   or `ttl` with a duration token in `param`).
 * @returns The decision tuple on success; `{ ok: false }` with
 *   `"bad ttl token"` when the ttl token does not parse, or
 *   `"unknown choice"` when `choice.kind` is not one of the four
 *   known kinds.
 */
export function resolveApprovalDecision(
  choice: ApprovalChoice,
): ResolvedApprovalDecision {
  switch (choice.kind) {
    case "deny":
      return {
        ok: true,
        decision: "deny",
        granted: false,
        ttl_ms: null,
        displayMode: "denied",
      };
    case "once":
      // No expiry — recorded as a one-shot grant; the agent calls
      // approval_lookup at most once, then proceeds. /approvals revoke
      // can still target the row by id.
      return {
        ok: true,
        decision: "allow_once",
        granted: true,
        ttl_ms: null,
        displayMode: "granted once",
      };
    case "always":
      return {
        ok: true,
        decision: "allow_always",
        granted: true,
        ttl_ms: null,
        displayMode: "granted always",
      };
    case "ttl": {
      const ms = ttlMsFromToken(choice.param);
      if (ms === null) return { ok: false, error: "bad ttl token" };
      return {
        ok: true,
        decision: "allow_ttl",
        granted: true,
        ttl_ms: ms,
        displayMode: `granted for ${choice.param}`,
      };
    }
    default:
      // Runtime guard: without it an unrecognised kind would fall out
      // of the switch and return `undefined`, and the caller's
      // `resolved.ok` check would throw instead of answering the
      // callback. Report it as a non-consuming failure like a bad ttl.
      return { ok: false, error: "unknown choice" };
  }
}
|
|
76
|
+
|
|
33
77
|
/**
|
|
34
78
|
* Build the post-tap keyboard for a granted decision. Today this is
|
|
35
79
|
* just the `[ 📖 Open in Drive ]` button when the granted scope names
|
|
@@ -57,55 +101,21 @@ export async function handleApprovalCallback(
|
|
|
57
101
|
return;
|
|
58
102
|
}
|
|
59
103
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
104
|
+
// Resolve + validate the decision BEFORE burning the single-use
|
|
105
|
+
// nonce (PR-4 invariant, now structural via the pure
|
|
106
|
+
// resolveApprovalDecision — see its doc). A malformed ttl token
|
|
107
|
+
// returns { ok: false } here and the nonce is never touched, so the
|
|
108
|
+
// operator can re-tap a valid choice; pre-fix this validation ran
|
|
109
|
+
// AFTER approvalConsume(), burning the nonce with no decision
|
|
110
|
+
// recorded → the agent's approval_lookup poll never saw a verdict
|
|
111
|
+
// and the turn wedged. There is now NO fallible step between the
|
|
112
|
+
// consume→record below.
|
|
113
|
+
const resolved = resolveApprovalDecision(parsed.choice);
|
|
114
|
+
if (!resolved.ok) {
|
|
115
|
+
await ctx.answerCallbackQuery({ text: resolved.error });
|
|
63
116
|
return;
|
|
64
117
|
}
|
|
65
|
-
|
|
66
|
-
// Single-use enforcement: someone already tapped, or the nonce
|
|
67
|
-
// expired/unknown. Match the RFC §8.1 wording.
|
|
68
|
-
await ctx.answerCallbackQuery({ text: "this prompt expired" });
|
|
69
|
-
return;
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
// Compute decision + ttl from the choice variant.
|
|
73
|
-
let decision: ApprovalDecisionMode;
|
|
74
|
-
let granted: boolean;
|
|
75
|
-
let ttl_ms: number | null = null;
|
|
76
|
-
let displayMode: string;
|
|
77
|
-
switch (parsed.choice.kind) {
|
|
78
|
-
case "deny":
|
|
79
|
-
decision = "deny";
|
|
80
|
-
granted = false;
|
|
81
|
-
displayMode = "denied";
|
|
82
|
-
break;
|
|
83
|
-
case "once":
|
|
84
|
-
decision = "allow_once";
|
|
85
|
-
granted = true;
|
|
86
|
-
// No expiry — recorded as a one-shot grant; the agent calls
|
|
87
|
-
// approval_lookup at most once, then proceeds. /approvals revoke
|
|
88
|
-
// can still target the row by id.
|
|
89
|
-
displayMode = "granted once";
|
|
90
|
-
break;
|
|
91
|
-
case "always":
|
|
92
|
-
decision = "allow_always";
|
|
93
|
-
granted = true;
|
|
94
|
-
displayMode = "granted always";
|
|
95
|
-
break;
|
|
96
|
-
case "ttl": {
|
|
97
|
-
decision = "allow_ttl";
|
|
98
|
-
granted = true;
|
|
99
|
-
const ms = ttlMsFromToken(parsed.choice.param);
|
|
100
|
-
if (ms === null) {
|
|
101
|
-
await ctx.answerCallbackQuery({ text: "bad ttl token" });
|
|
102
|
-
return;
|
|
103
|
-
}
|
|
104
|
-
ttl_ms = ms;
|
|
105
|
-
displayMode = `granted for ${parsed.choice.param}`;
|
|
106
|
-
break;
|
|
107
|
-
}
|
|
108
|
-
}
|
|
118
|
+
const { decision, granted, ttl_ms, displayMode } = resolved;
|
|
109
119
|
|
|
110
120
|
const granted_by_user_id = ctx.from?.id ?? 0;
|
|
111
121
|
// Approver set at decision time = the chat that received the card. We
|
|
@@ -114,18 +124,37 @@ export async function handleApprovalCallback(
|
|
|
114
124
|
// when each surface migrates and starts passing access.allowFrom.
|
|
115
125
|
const approver_set = [String(granted_by_user_id)];
|
|
116
126
|
|
|
117
|
-
|
|
127
|
+
// PR-6: atomic consume+record — ONE round-trip; the kernel burns the
|
|
128
|
+
// single-use nonce AND writes the decision in one SQLite transaction.
|
|
129
|
+
// If the record fails the burn rolls back, so `null` genuinely means
|
|
130
|
+
// "nothing happened, safe to retry" — there is no burned-nonce /
|
|
131
|
+
// no-decision wedge any more (the residual the shipped permission-TTL
|
|
132
|
+
// auto-deny used to backstop). resolveApprovalDecision already
|
|
133
|
+
// validated the ttl above, so no fallible step precedes this call.
|
|
134
|
+
const result = await approvalConsumeRecord({
|
|
118
135
|
request_id: parsed.request_id,
|
|
119
136
|
decision,
|
|
120
137
|
approver_set,
|
|
121
138
|
granted_by_user_id,
|
|
122
139
|
ttl_ms,
|
|
123
140
|
});
|
|
124
|
-
|
|
125
|
-
|
|
141
|
+
if (result === null) {
|
|
142
|
+
await ctx.answerCallbackQuery({ text: "approval kernel unreachable" });
|
|
143
|
+
return;
|
|
144
|
+
}
|
|
145
|
+
if (!result.consumed) {
|
|
146
|
+
// Already tapped / expired / unknown — single-use is enforced
|
|
147
|
+
// kernel-side and NO decision was written. RFC §8.1 wording.
|
|
148
|
+
await ctx.answerCallbackQuery({ text: "this prompt expired" });
|
|
149
|
+
return;
|
|
150
|
+
}
|
|
151
|
+
if (!result.decision_id) {
|
|
152
|
+
// Defensive: consumed:true must carry a decision_id. Kept distinct
|
|
153
|
+
// from the unreachable message for operator triage.
|
|
126
154
|
await ctx.answerCallbackQuery({ text: "kernel record failed" });
|
|
127
155
|
return;
|
|
128
156
|
}
|
|
157
|
+
const decision_id: string = result.decision_id;
|
|
129
158
|
|
|
130
159
|
// Edit the original card to its post-tap state. Drop the original
|
|
131
160
|
// action keyboard either way; on a successful grant for a Drive
|
|
@@ -138,8 +167,8 @@ export async function handleApprovalCallback(
|
|
|
138
167
|
? ` · /approvals revoke <code>${decision_id}</code>`
|
|
139
168
|
: "");
|
|
140
169
|
|
|
141
|
-
const postTapKeyboard = granted &&
|
|
142
|
-
? buildGrantedKeyboard(
|
|
170
|
+
const postTapKeyboard = granted && result.scope
|
|
171
|
+
? buildGrantedKeyboard(result.scope)
|
|
143
172
|
: undefined;
|
|
144
173
|
|
|
145
174
|
try {
|
|
@@ -245,10 +245,14 @@ import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
|
|
|
245
245
|
import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
|
|
246
246
|
import { handleRequestDriveApproval } from './drive-write-approval.js'
|
|
247
247
|
import { buildDiffPreviewCard } from './diff-preview-card.js'
|
|
248
|
-
import { createPendingInboundBuffer } from './pending-inbound-buffer.js'
|
|
248
|
+
import { createPendingInboundBuffer, redeliverBufferedInbound } from './pending-inbound-buffer.js'
|
|
249
|
+
import { createPendingPermissionBuffer } from './pending-permission-decisions.js'
|
|
249
250
|
import {
|
|
250
251
|
buildVaultGrantApprovedInbound,
|
|
251
252
|
buildVaultGrantDeniedInbound,
|
|
253
|
+
buildVaultSaveCompletedInbound,
|
|
254
|
+
buildVaultSaveFailedInbound,
|
|
255
|
+
buildVaultSaveDiscardedInbound,
|
|
252
256
|
} from './vault-grant-inbound-builders.js'
|
|
253
257
|
import { createPollHealthCheck, type PollHealthCheckHandle } from './poll-health.js'
|
|
254
258
|
import type {
|
|
@@ -262,6 +266,7 @@ import type {
|
|
|
262
266
|
PtyPartialForward,
|
|
263
267
|
InboundMessage,
|
|
264
268
|
InjectInboundMessage,
|
|
269
|
+
PermissionEvent,
|
|
265
270
|
} from './ipc-protocol.js'
|
|
266
271
|
import { DebounceBuffer, HourCap, buildReactionInboundMeta, buildReactionInboundText, evaluateTriggerCandidate, isGroupChat, resolveReactionsConfig, truncatePreview, type PendingReaction, type ReactionBatch, type ReactionsResolvedConfig } from './reaction-trigger.js'
|
|
267
272
|
import { writePidFile, clearPidFile } from './pid-file.js'
|
|
@@ -2093,7 +2098,23 @@ const pendingStateReaper = setInterval(() => {
|
|
|
2093
2098
|
if (now - v.startedAt > VAULT_INPUT_TTL_MS) pendingVaultOps.delete(k)
|
|
2094
2099
|
}
|
|
2095
2100
|
for (const [k, v] of pendingPermissions) {
|
|
2096
|
-
if (now - v.startedAt > PERMISSION_TTL_MS)
|
|
2101
|
+
if (now - v.startedAt > PERMISSION_TTL_MS) {
|
|
2102
|
+
// Don't just drop it: the claude turn is suspended INSIDE the MCP
|
|
2103
|
+
// permission call waiting for a verdict. A silent delete left it
|
|
2104
|
+
// wedged forever when the operator never tapped — permanent
|
|
2105
|
+
// silence, the exact symptom this series fixes. Auto-deny so the
|
|
2106
|
+
// call unblocks; claude then tells the user it couldn't get
|
|
2107
|
+
// permission (or takes a fallback). Routed through
|
|
2108
|
+
// dispatchPermissionVerdict so it's buffered+redelivered too if
|
|
2109
|
+
// the bridge is also offline at sweep time.
|
|
2110
|
+
dispatchPermissionVerdict({ type: 'permission', requestId: k, behavior: 'deny' })
|
|
2111
|
+
process.stderr.write(
|
|
2112
|
+
`telegram gateway: permission TTL expired — auto-deny request=${k} ` +
|
|
2113
|
+
`tool=${v.tool_name} (no operator response in ` +
|
|
2114
|
+
`${Math.round(PERMISSION_TTL_MS / 60000)}m)\n`,
|
|
2115
|
+
)
|
|
2116
|
+
pendingPermissions.delete(k)
|
|
2117
|
+
}
|
|
2097
2118
|
}
|
|
2098
2119
|
for (const [k, v] of vaultPassphraseCache) {
|
|
2099
2120
|
if (now > v.expiresAt) vaultPassphraseCache.delete(k)
|
|
@@ -2722,10 +2743,27 @@ silencePoke.startTimer({
|
|
|
2722
2743
|
try {
|
|
2723
2744
|
clearSilentEndState(fbKey)
|
|
2724
2745
|
} catch { /* best-effort */ }
|
|
2746
|
+
// Self-heal the inbound buffer. pendingInboundBuffer otherwise
|
|
2747
|
+
// drains ONLY on bridge re-register (onClientRegistered). After a
|
|
2748
|
+
// network storm that settles with the bridge STILL connected, user
|
|
2749
|
+
// messages buffered during the flap sit forever — until a manual
|
|
2750
|
+
// restart forces a re-register (the fleet-update thundering-herd
|
|
2751
|
+
// incident, 2026-05-19: agents "not responding", logs show
|
|
2752
|
+
// pending-inbound-buffer depth>0 with no drain). Flushing on
|
|
2753
|
+
// wedge-clear makes the agent self-heal. selfAgent-keyed; a miss
|
|
2754
|
+
// re-buffers so nothing is lost if the bridge is genuinely offline.
|
|
2755
|
+
const fbSelfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
2756
|
+
const fbRedeliver = redeliverBufferedInbound(
|
|
2757
|
+
pendingInboundBuffer,
|
|
2758
|
+
fbSelfAgent,
|
|
2759
|
+
(m) => ipcServer.sendToAgent(fbSelfAgent, m),
|
|
2760
|
+
)
|
|
2725
2761
|
process.stderr.write(
|
|
2726
2762
|
`telegram gateway: silence-poke framework-fallback ended wedged turn ` +
|
|
2727
2763
|
`chat=${fbChatId} thread=${ctx.threadId ?? '-'} silence_ms=${ctx.silenceMs} ` +
|
|
2728
|
-
`currentTurn_nulled=${turnMatchesFallback}
|
|
2764
|
+
`currentTurn_nulled=${turnMatchesFallback} ` +
|
|
2765
|
+
`drained_buffered=${fbRedeliver.redelivered}/${fbRedeliver.drained}` +
|
|
2766
|
+
`${fbRedeliver.rebuffered > 0 ? ` rebuffered=${fbRedeliver.rebuffered}` : ''}\n`,
|
|
2729
2767
|
)
|
|
2730
2768
|
},
|
|
2731
2769
|
})
|
|
@@ -2738,6 +2776,36 @@ silencePoke.startTimer({
|
|
|
2738
2776
|
// would mint the grant but silently drop the `vault_grant_approved`
|
|
2739
2777
|
// inbound, leaving the agent stuck waiting for a manual poke.
|
|
2740
2778
|
const pendingInboundBuffer = createPendingInboundBuffer()
|
|
2779
|
+
const pendingPermissionBuffer = createPendingPermissionBuffer()
|
|
2780
|
+
|
|
2781
|
+
/**
 * Send a permission verdict to this agent's registered bridge; if the
 * send is not delivered, park the verdict in `pendingPermissionBuffer`
 * so `onClientRegistered` can re-send it when the bridge reconnects.
 *
 * Used at every verdict site (and by the TTL-sweep auto-deny) in place
 * of a bare `ipcServer.broadcast({type:'permission',...})`, which was
 * fire-and-forget: a verdict produced while the bridge was
 * mid-reconnect was dropped and the claude turn stayed suspended
 * inside the MCP permission call. `sendToAgent` reports whether the
 * event was actually delivered, which is what makes buffering on a
 * miss possible. Per the buffer's contract a late verdict for a dead
 * request_id is harmless.
 *
 * Kept as a function declaration (hoisted) so the TTL sweep defined
 * earlier in the module can call it; `ipcServer` and
 * `pendingPermissionBuffer` are only read when the function runs.
 *
 * @param ev - The permission verdict event to deliver.
 */
function dispatchPermissionVerdict(ev: PermissionEvent): void {
  const agentName = process.env.SWITCHROOM_AGENT_NAME ?? ''

  // Happy path: the registered bridge took the verdict, nothing to buffer.
  if (ipcServer.sendToAgent(agentName, ev)) return

  pendingPermissionBuffer.push(agentName, ev)
  const notice =
    `telegram gateway: permission verdict buffered (bridge offline) ` +
    `request=${ev.requestId} behavior=${ev.behavior}\n`
  process.stderr.write(notice)
}
|
|
2741
2809
|
|
|
2742
2810
|
const ipcServer: IpcServer = createIpcServer({
|
|
2743
2811
|
socketPath: SOCKET_PATH,
|
|
@@ -2765,6 +2833,22 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
2765
2833
|
)
|
|
2766
2834
|
}
|
|
2767
2835
|
}
|
|
2836
|
+
// PR-3: drain permission verdicts missed while the bridge was
|
|
2837
|
+
// offline. A claude turn suspended inside the MCP permission call
|
|
2838
|
+
// is unblocked the moment the reconnecting bridge relays the
|
|
2839
|
+
// verdict; without this the verdict (incl. the TTL auto-deny) was
|
|
2840
|
+
// lost and the turn stayed silent forever.
|
|
2841
|
+
const pendingVerdicts = pendingPermissionBuffer.drain(client.agentName)
|
|
2842
|
+
for (const ev of pendingVerdicts) {
|
|
2843
|
+
try {
|
|
2844
|
+
client.send(ev)
|
|
2845
|
+
} catch (err) {
|
|
2846
|
+
process.stderr.write(
|
|
2847
|
+
`telegram gateway: pending-permission drain failed agent=${client.agentName} ` +
|
|
2848
|
+
`request=${ev.requestId} behavior=${ev.behavior}: ${(err as Error).message}\n`,
|
|
2849
|
+
)
|
|
2850
|
+
}
|
|
2851
|
+
}
|
|
2768
2852
|
}
|
|
2769
2853
|
|
|
2770
2854
|
// If the agent reconnected after a /restart (or any restart), post a boot
|
|
@@ -6251,7 +6335,7 @@ async function handleInbound(
|
|
|
6251
6335
|
// Forward permission reply to connected bridge
|
|
6252
6336
|
const behavior = permMatch[1]!.toLowerCase().startsWith('y') ? 'allow' : 'deny'
|
|
6253
6337
|
const request_id = permMatch[2]!.toLowerCase()
|
|
6254
|
-
|
|
6338
|
+
dispatchPermissionVerdict({
|
|
6255
6339
|
type: 'permission',
|
|
6256
6340
|
requestId: request_id,
|
|
6257
6341
|
behavior,
|
|
@@ -6981,17 +7065,28 @@ async function handleInbound(
|
|
|
6981
7065
|
},
|
|
6982
7066
|
}
|
|
6983
7067
|
|
|
6984
|
-
//
|
|
6985
|
-
|
|
6986
|
-
|
|
6987
|
-
|
|
7068
|
+
// Deliver to THIS agent's registered bridge, buffering on miss.
|
|
7069
|
+
// broadcast()/clientCount() were the wrong primitives: broadcast is
|
|
7070
|
+
// not registered-keyed (writes to any alive socket incl. an
|
|
7071
|
+
// unregistered pre-handshake one) and yields no delivered signal,
|
|
7072
|
+
// and clientCount() counts unregistered sockets — so a bridge
|
|
7073
|
+
// mid-reconnect made clientCount()>0, the message was broadcast into
|
|
7074
|
+
// a non-registered socket, the "restarting" notice was suppressed,
|
|
7075
|
+
// and the user's message was silently lost. The old "queued either
|
|
7076
|
+
// way" comment was false: broadcast does not queue. sendToAgent is
|
|
7077
|
+
// registered-keyed + returns a real delivered bool; on a miss we
|
|
7078
|
+
// push to pendingInboundBuffer, which onClientRegistered drains on
|
|
7079
|
+
// the next bridge register — so the notice below is now truthful.
|
|
7080
|
+
const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
7081
|
+
const delivered = ipcServer.sendToAgent(selfAgent, inboundMsg)
|
|
6988
7082
|
if (!delivered) {
|
|
7083
|
+
pendingInboundBuffer.push(selfAgent, inboundMsg)
|
|
6989
7084
|
const threadOpts = messageThreadId != null ? { message_thread_id: messageThreadId } : {}
|
|
6990
7085
|
// #1075: thread-id-bearing — swallow via robustApiCall so a deleted
|
|
6991
|
-
// topic doesn't crash the gateway. Fire-and-forget; the
|
|
6992
|
-
//
|
|
7086
|
+
// topic doesn't crash the gateway. Fire-and-forget; the inbound is
|
|
7087
|
+
// genuinely buffered now, so the hint is accurate, not a guess.
|
|
6993
7088
|
void swallowingApiCall(
|
|
6994
|
-
() => bot.api.sendMessage(chat_id, '⏳ Agent is restarting
|
|
7089
|
+
() => bot.api.sendMessage(chat_id, '⏳ Agent is restarting — your message is queued and will be processed when it reconnects.', { ...threadOpts }),
|
|
6995
7090
|
{
|
|
6996
7091
|
chat_id,
|
|
6997
7092
|
verb: 'agent-restarting-notice',
|
|
@@ -8847,7 +8942,7 @@ async function handlePermissionSlash(ctx: Context, behavior: 'allow' | 'deny'):
|
|
|
8847
8942
|
return
|
|
8848
8943
|
}
|
|
8849
8944
|
// Forward to connected bridges — same IPC the button handler uses.
|
|
8850
|
-
|
|
8945
|
+
dispatchPermissionVerdict({ type: 'permission', requestId: request_id, behavior })
|
|
8851
8946
|
pendingPermissions.delete(request_id)
|
|
8852
8947
|
process.stderr.write(
|
|
8853
8948
|
`[telegram gateway] slash-${behavior} request_id=${request_id} tool=${details.tool_name} by=${senderId}\n`,
|
|
@@ -10039,6 +10134,21 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
|
|
|
10039
10134
|
)
|
|
10040
10135
|
.catch(() => {})
|
|
10041
10136
|
}
|
|
10137
|
+
// Wake the agent that called vault_request_save — symmetric with
|
|
10138
|
+
// the vra: approve/deny path (#1052/#1150/#1156). Without this the
|
|
10139
|
+
// tool returned "waiting for operator", the turn ended, and a
|
|
10140
|
+
// Discard left the agent silently idle forever.
|
|
10141
|
+
const discardInbound = buildVaultSaveDiscardedInbound({
|
|
10142
|
+
ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
|
|
10143
|
+
stageId,
|
|
10144
|
+
operatorId: senderId,
|
|
10145
|
+
})
|
|
10146
|
+
const dDelivered = ipcServer.sendToAgent(pending.agent, discardInbound)
|
|
10147
|
+
process.stderr.write(
|
|
10148
|
+
`telegram gateway: vault_save_discarded injection agent=${pending.agent} ` +
|
|
10149
|
+
`key=${pending.key} stage=${stageId} delivered=${dDelivered}\n`,
|
|
10150
|
+
)
|
|
10151
|
+
if (!dDelivered) pendingInboundBuffer.push(pending.agent, discardInbound)
|
|
10042
10152
|
return
|
|
10043
10153
|
}
|
|
10044
10154
|
|
|
@@ -10143,6 +10253,22 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
|
|
|
10143
10253
|
// retry by re-invoking the same MCP tool, but the value will be
|
|
10144
10254
|
// re-staged with a new ID. Drop the current stage.
|
|
10145
10255
|
pendingVaultRequestSaves.delete(stageId)
|
|
10256
|
+
// Wake the waiting agent with the failure (symmetric with the
|
|
10257
|
+
// success/discard paths) so it doesn't assume vault:<key> exists.
|
|
10258
|
+
const failReason =
|
|
10259
|
+
(write.output || 'vault write error').split('\n')[0]!.slice(0, 200)
|
|
10260
|
+
const failInbound = buildVaultSaveFailedInbound({
|
|
10261
|
+
ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
|
|
10262
|
+
stageId,
|
|
10263
|
+
operatorId: senderId,
|
|
10264
|
+
reason: failReason,
|
|
10265
|
+
})
|
|
10266
|
+
const fDelivered = ipcServer.sendToAgent(pending.agent, failInbound)
|
|
10267
|
+
process.stderr.write(
|
|
10268
|
+
`telegram gateway: vault_save_failed injection agent=${pending.agent} ` +
|
|
10269
|
+
`key=${pending.key} stage=${stageId} delivered=${fDelivered}\n`,
|
|
10270
|
+
)
|
|
10271
|
+
if (!fDelivered) pendingInboundBuffer.push(pending.agent, failInbound)
|
|
10146
10272
|
return
|
|
10147
10273
|
}
|
|
10148
10274
|
|
|
@@ -10158,6 +10284,20 @@ async function handleVaultRequestSaveCallback(ctx: Context, data: string): Promi
|
|
|
10158
10284
|
)
|
|
10159
10285
|
.catch(() => {})
|
|
10160
10286
|
}
|
|
10287
|
+
// Wake the agent that called vault_request_save so it resumes the
|
|
10288
|
+
// task that was blocked on this credential (symmetric with the
|
|
10289
|
+
// vra: approve path; buffered if the bridge is mid-reconnect).
|
|
10290
|
+
const okInbound = buildVaultSaveCompletedInbound({
|
|
10291
|
+
ctx: { agent: pending.agent, key: pending.key, chat_id: pending.chat_id },
|
|
10292
|
+
stageId,
|
|
10293
|
+
operatorId: senderId,
|
|
10294
|
+
})
|
|
10295
|
+
const okDelivered = ipcServer.sendToAgent(pending.agent, okInbound)
|
|
10296
|
+
process.stderr.write(
|
|
10297
|
+
`telegram gateway: vault_save_completed injection agent=${pending.agent} ` +
|
|
10298
|
+
`key=${pending.key} stage=${stageId} delivered=${okDelivered}\n`,
|
|
10299
|
+
)
|
|
10300
|
+
if (!okDelivered) pendingInboundBuffer.push(pending.agent, okInbound)
|
|
10161
10301
|
return
|
|
10162
10302
|
}
|
|
10163
10303
|
|
|
@@ -12084,16 +12224,25 @@ bot.on('callback_query:data', async ctx => {
|
|
|
12084
12224
|
process.stderr.write(
|
|
12085
12225
|
`telegram gateway: button_callback chatId=${cbChatId} user=${ctx.from.id} data=${JSON.stringify(agentCb.raw)} btnText=${JSON.stringify(buttonText ?? null)}\n`,
|
|
12086
12226
|
)
|
|
12087
|
-
|
|
12088
|
-
|
|
12089
|
-
|
|
12090
|
-
|
|
12227
|
+
// Registered-keyed delivery + buffer-on-miss (same fix as the
|
|
12228
|
+
// normal-inbound path above): broadcast()/clientCount() lost the
|
|
12229
|
+
// tap whenever the bridge was mid-reconnect (clientCount() counts
|
|
12230
|
+
// unregistered sockets, so the notice was suppressed AND nothing
|
|
12231
|
+
// was actually queued). sendToAgent → pendingInboundBuffer (drained
|
|
12232
|
+
// by onClientRegistered) makes the "queued" promise real.
|
|
12233
|
+
const selfAgentBtn = process.env.SWITCHROOM_AGENT_NAME ?? ''
|
|
12234
|
+
const btnDelivered = ipcServer.sendToAgent(selfAgentBtn, inboundMsg)
|
|
12235
|
+
if (!btnDelivered) {
|
|
12236
|
+
pendingInboundBuffer.push(selfAgentBtn, inboundMsg)
|
|
12237
|
+
// No registered bridge — the agent's mid-restart. Tell the user
|
|
12238
|
+
// so they don't think the button silently swallowed their tap;
|
|
12239
|
+
// the tap is genuinely buffered now and replays on reconnect.
|
|
12091
12240
|
// #1075: thread-id-bearing — swallow on THREAD_NOT_FOUND.
|
|
12092
12241
|
void swallowingApiCall(
|
|
12093
12242
|
() =>
|
|
12094
12243
|
bot.api.sendMessage(
|
|
12095
12244
|
cbChatId,
|
|
12096
|
-
'⏳ Agent is restarting — your button tap
|
|
12245
|
+
'⏳ Agent is restarting — your button tap is queued and will be processed when it comes back.',
|
|
12097
12246
|
cbThreadId != null ? { message_thread_id: cbThreadId } : {},
|
|
12098
12247
|
),
|
|
12099
12248
|
{
|
|
@@ -12222,7 +12371,7 @@ bot.on('callback_query:data', async ctx => {
|
|
|
12222
12371
|
// otherwise the rule may be unsafe to honour at scale and we
|
|
12223
12372
|
// fall back to single-use allow.
|
|
12224
12373
|
synthInbound: () => {
|
|
12225
|
-
|
|
12374
|
+
dispatchPermissionVerdict({
|
|
12226
12375
|
type: 'permission',
|
|
12227
12376
|
requestId: request_id,
|
|
12228
12377
|
behavior: 'allow',
|
|
@@ -12260,7 +12409,7 @@ bot.on('callback_query:data', async ctx => {
|
|
|
12260
12409
|
newText: baseText ? `${baseText}\n\n${label}` : label,
|
|
12261
12410
|
parseMode: 'HTML',
|
|
12262
12411
|
synthInbound: () => {
|
|
12263
|
-
|
|
12412
|
+
dispatchPermissionVerdict({
|
|
12264
12413
|
type: 'permission',
|
|
12265
12414
|
requestId: request_id,
|
|
12266
12415
|
behavior: behavior as 'allow' | 'deny',
|
|
@@ -54,6 +54,45 @@ export interface PendingInboundBufferOptions {
|
|
|
54
54
|
log?: (line: string) => void
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
+
/**
 * Empty `agent`'s buffered inbound messages and hand each one to
 * `send`. Any message that `send` does not deliver — it returns a
 * falsy value or throws — is pushed straight back onto `buffer`, so
 * nothing is lost while the bridge is still offline.
 *
 * Why this exists: `drain` is otherwise only invoked on bridge
 * re-register (`onClientRegistered`). After a network storm that
 * settles with the bridge STILL connected, messages buffered during
 * the flap would sit until a manual restart forced a re-register. The
 * silence-poke framework fallback calls this on wedge-clear so the
 * agent self-heals (fleet-update thundering-herd incident,
 * 2026-05-19).
 *
 * @param buffer - The pending-inbound buffer to drain and, on a miss,
 *   push back into.
 * @param agent - Agent key whose backlog is re-delivered.
 * @param send - Delivery callback; `true` means delivered.
 * @returns Counts for observability: how many messages were drained,
 *   how many were delivered, and how many were re-buffered.
 */
export function redeliverBufferedInbound(
  buffer: PendingInboundBuffer,
  agent: string,
  send: (msg: InboundMessage) => boolean,
): { drained: number; redelivered: number; rebuffered: number } {
  // A throwing `send` counts as "not delivered", same as `false`.
  const trySend = (msg: InboundMessage): boolean => {
    try {
      return send(msg)
    } catch {
      return false
    }
  }

  const backlog = buffer.drain(agent)
  let okCount = 0
  let missCount = 0
  for (let i = 0; i < backlog.length; i++) {
    const msg = backlog[i]
    if (trySend(msg)) {
      okCount += 1
      continue
    }
    buffer.push(agent, msg)
    missCount += 1
  }
  return { drained: backlog.length, redelivered: okCount, rebuffered: missCount }
}
|
|
95
|
+
|
|
57
96
|
export function createPendingInboundBuffer(
|
|
58
97
|
opts: PendingInboundBufferOptions = {},
|
|
59
98
|
): PendingInboundBuffer {
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-agent buffer for permission verdicts the gateway couldn't deliver
|
|
3
|
+
* because no live IPC client was registered for the agent at send-time.
|
|
4
|
+
*
|
|
5
|
+
* Background (PR-3 of the callback→model-continuation series): a
|
|
6
|
+
* tool/skill/MCP permission request suspends the claude turn *inside*
|
|
7
|
+
* the MCP permission call until the gateway relays the operator's
|
|
8
|
+
* Approve/Deny verdict back (`{type:'permission'}` → bridge
|
|
9
|
+
* `onPermission` → Claude Code). The verdict sites previously used
|
|
10
|
+
* `ipcServer.broadcast(...)`, which is fire-and-forget: if the bridge
|
|
11
|
+
* was mid-reconnect at the exact moment the operator tapped (every
|
|
12
|
+
* agent/gateway restart, claude session bounce), the verdict was
|
|
13
|
+
* dropped and the model stayed wedged forever — the user's tap did
|
|
14
|
+
* nothing and they were left silent.
|
|
15
|
+
*
|
|
16
|
+
* This is the permission-verdict analog of `pending-inbound-buffer.ts`:
|
|
17
|
+
* the verdict sites now `sendToAgent` (registered-keyed, real delivered
|
|
18
|
+
* bool) and on a miss `push()` here; `onClientRegistered` `drain()`s
|
|
19
|
+
* and re-sends so a reconnecting bridge relays the missed verdict to
|
|
20
|
+
* the still-suspended permission call.
|
|
21
|
+
*
|
|
22
|
+
* Contract mirrors pending-inbound-buffer:
|
|
23
|
+
* - `push(agent, ev)` best-effort, synchronous, bounded.
|
|
24
|
+
* - `drain(agent)` returns ALL pending verdicts in insertion order
|
|
25
|
+
* and clears them; called from `onClientRegistered`.
|
|
26
|
+
* - In-memory only; survives reconnect within one gateway lifetime,
|
|
27
|
+
* not a gateway restart. A late-redelivered verdict for a
|
|
28
|
+
* request_id claude no longer has is harmless — the bridge relays
|
|
29
|
+
* it and Claude Code ignores an unknown request_id. The TTL-sweep
|
|
30
|
+
* auto-deny is the independent backstop for "operator never tapped".
|
|
31
|
+
*
|
|
32
|
+
* Per-agent cap prevents a never-reconnecting bridge from leaking
|
|
33
|
+
* memory; on overflow the OLDEST verdict is dropped (freshest is most
|
|
34
|
+
* relevant) and logged.
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
import type { PermissionEvent } from './ipc-protocol.js'
|
|
38
|
+
|
|
39
|
+
/** Default cap per agent — a reasonable backlog of permission cards
|
|
40
|
+
* stacked while the bridge is offline, no more. */
|
|
41
|
+
export const DEFAULT_PENDING_PERMISSION_CAP = 32
|
|
42
|
+
|
|
43
|
+
export interface PendingPermissionBuffer {
|
|
44
|
+
/** Append `ev` to `agent`'s queue. Returns true if accepted without
|
|
45
|
+
* eviction, false if the cap forced dropping the oldest (the new
|
|
46
|
+
* entry is STILL accepted). */
|
|
47
|
+
push: (agent: string, ev: PermissionEvent) => boolean
|
|
48
|
+
/** Pop and return all pending verdicts for `agent` (insertion order).
|
|
49
|
+
* Empty array when none. Idempotent. */
|
|
50
|
+
drain: (agent: string) => PermissionEvent[]
|
|
51
|
+
/** Test-only: current depth for `agent`. */
|
|
52
|
+
depth: (agent: string) => number
|
|
53
|
+
/** Test-only: total depth across all agents. */
|
|
54
|
+
totalDepth: () => number
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
export interface PendingPermissionBufferOptions {
|
|
58
|
+
capPerAgent?: number
|
|
59
|
+
log?: (line: string) => void
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
 * Build an in-memory, per-agent FIFO of undelivered permission
 * verdicts.
 *
 * Each agent's queue holds at most `capPerAgent` entries (default
 * `DEFAULT_PENDING_PERMISSION_CAP`); pushing onto a full queue drops
 * the oldest entry first and still accepts the new one. Every push,
 * eviction and non-empty drain is reported through `log` (default:
 * write to stderr).
 *
 * @param opts - Optional cap and log sink overrides.
 * @returns The buffer's `push` / `drain` / `depth` / `totalDepth`
 *   operations, all closed over one private map.
 */
export function createPendingPermissionBuffer(
  opts: PendingPermissionBufferOptions = {},
): PendingPermissionBuffer {
  const maxDepth = opts.capPerAgent ?? DEFAULT_PENDING_PERMISSION_CAP
  const emit = opts.log ?? ((line: string) => process.stderr.write(line))
  const byAgent = new Map<string, PermissionEvent[]>()

  // Look up the agent's queue, creating and registering it on first use.
  const queueFor = (agent: string): PermissionEvent[] => {
    const existing = byAgent.get(agent)
    if (existing != null) return existing
    const fresh: PermissionEvent[] = []
    byAgent.set(agent, fresh)
    return fresh
  }

  function push(agent: string, ev: PermissionEvent): boolean {
    const queue = queueFor(agent)
    const overflow = queue.length >= maxDepth
    if (overflow) {
      // Full: make room by discarding the oldest verdict.
      const oldest = queue.shift()
      emit(
        `pending-permission-buffer: agent=${agent} cap=${maxDepth} reached — ` +
          `dropped oldest verdict request=${oldest?.requestId ?? '-'} ` +
          `behavior=${oldest?.behavior ?? '-'}\n`,
      )
    }
    queue.push(ev)
    emit(
      `pending-permission-buffer: agent=${agent} buffered request=${ev.requestId} ` +
        `behavior=${ev.behavior} depth_after=${queue.length} evicted=${overflow}\n`,
    )
    return !overflow
  }

  function drain(agent: string): PermissionEvent[] {
    const queue = byAgent.get(agent)
    if (!queue?.length) return []
    // Detach the queue before handing it out so later pushes start fresh.
    byAgent.delete(agent)
    const ids = queue.map((entry) => entry.requestId).join(',')
    emit(
      `pending-permission-buffer: drained agent=${agent} count=${queue.length} ` +
        `requests=[${ids}]\n`,
    )
    return queue
  }

  function depth(agent: string): number {
    const queue = byAgent.get(agent)
    return queue === undefined ? 0 : queue.length
  }

  function totalDepth(): number {
    let total = 0
    byAgent.forEach((queue) => {
      total += queue.length
    })
    return total
  }

  return { push, drain, depth, totalDepth }
}
|