switchroom 0.14.9 → 0.14.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-scheduler/index.js +161 -157
- package/dist/auth-broker/index.js +82 -80
- package/dist/cli/drive-write-pretool.mjs +10 -10
- package/dist/cli/notion-write-pretool.mjs +84 -82
- package/dist/cli/skill-validate-pretool.mjs +72 -72
- package/dist/cli/switchroom.js +1053 -917
- package/dist/host-control/main.js +150 -148
- package/dist/vault/approvals/kernel-server.js +84 -82
- package/dist/vault/broker/server.js +85 -83
- package/package.json +3 -3
- package/telegram-plugin/dist/bridge/bridge.js +112 -112
- package/telegram-plugin/dist/gateway/gateway.js +1218 -634
- package/telegram-plugin/dist/server.js +160 -160
- package/telegram-plugin/gateway/boot-card.ts +15 -11
- package/telegram-plugin/gateway/gateway.ts +94 -67
- package/telegram-plugin/gateway/webhook-ingest-server.test.ts +125 -0
- package/telegram-plugin/gateway/webhook-ingest-server.ts +218 -0
- package/telegram-plugin/runtime-metrics.ts +8 -52
- package/telegram-plugin/silence-poke.ts +39 -312
- package/telegram-plugin/tests/boot-card-silent-on-operator.test.ts +27 -30
- package/telegram-plugin/tests/silence-poke.test.ts +54 -569
- package/telegram-plugin/uat/scenarios/jtbd-fast-ack-dm.test.ts +21 -23
- package/telegram-plugin/uat/scenarios/silence-poke-soft-dm.test.ts +0 -155
|
@@ -625,17 +625,21 @@ export async function startBootCard(
|
|
|
625
625
|
...(opts.updateOutcomeLine ? { updateOutcomeLine: opts.updateOutcomeLine } : {}),
|
|
626
626
|
})
|
|
627
627
|
|
|
628
|
-
//
|
|
629
|
-
//
|
|
630
|
-
//
|
|
631
|
-
// a
|
|
632
|
-
//
|
|
633
|
-
//
|
|
634
|
-
//
|
|
635
|
-
//
|
|
636
|
-
//
|
|
637
|
-
//
|
|
638
|
-
|
|
628
|
+
// Boot cards are ALWAYS delivered silently (no Telegram
|
|
629
|
+
// notification). They land in the chat as a record — the operator
|
|
630
|
+
// can scroll up to see "✅ <agent> back up · vX.Y.Z" — but they
|
|
631
|
+
// never buzz a phone. Rationale: every agent posts a card on every
|
|
632
|
+
// restart, so a fleet redeploy of N agents produced N notifications;
|
|
633
|
+
// even a single user `/restart` or a crash-recovery is a status
|
|
634
|
+
// record, not something that should pull attention. Operator
|
|
635
|
+
// decision (2026-05-29): silence them all, unconditionally.
|
|
636
|
+
//
|
|
637
|
+
// Previously this was keyed on the `operator:` reason-detail prefix
|
|
638
|
+
// (only routine `switchroom update` was silent); user `/restart`,
|
|
639
|
+
// `cli: switchroom restart` rollouts, crashes, and fresh boots all
|
|
640
|
+
// still notified. That distinction is gone — the card is the record,
|
|
641
|
+
// the chat is where you look, and nothing here warrants a push.
|
|
642
|
+
const silentBootCard = true
|
|
639
643
|
|
|
640
644
|
let messageId: number
|
|
641
645
|
try {
|
|
@@ -263,6 +263,8 @@ import { formatUpdateStatusLine } from './update-status-line.js'
|
|
|
263
263
|
import type { HostdRequest } from '../../src/host-control/protocol.js'
|
|
264
264
|
import type { AgentAudit } from '../welcome-text.js'
|
|
265
265
|
import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
|
|
266
|
+
import { startWebhookIngestServer } from './webhook-ingest-server.js'
|
|
267
|
+
import { recordWebhookEvent } from '../../src/web/webhook-gateway-record.js'
|
|
266
268
|
|
|
267
269
|
import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
|
|
268
270
|
import { handleRequestDriveApproval } from './drive-write-approval.js'
|
|
@@ -3405,11 +3407,13 @@ function ensureIssuesCard(chatId: string, threadId: number | undefined): void {
|
|
|
3405
3407
|
}
|
|
3406
3408
|
|
|
3407
3409
|
// #1122: framework safety-net for "model is silent to the user for >5min."
|
|
3408
|
-
// Starts a single setInterval poll that walks active turns
|
|
3409
|
-
//
|
|
3410
|
-
//
|
|
3411
|
-
//
|
|
3412
|
-
//
|
|
3410
|
+
// Starts a single setInterval poll that walks active turns; at 300s of
|
|
3411
|
+
// silence the framework itself sends a user-visible "still working… /
|
|
3412
|
+
// still thinking…" message AND unwedges the turn. The model-targeted
|
|
3413
|
+
// nudge ladder (ack/soft/firm) and the 60s awareness ping were retired
|
|
3414
|
+
// once the live-updating reply/draft took over the pacing job — only
|
|
3415
|
+
// this single unwedge fallback remains. Honours
|
|
3416
|
+
// SWITCHROOM_DISABLE_SILENCE_POKE=1 kill switch (no-op if set).
|
|
3413
3417
|
// Set when this gateway dispatches an `update_apply` to hostd that
|
|
3414
3418
|
// returns `started`; cleared when the dispatch poll resolves (terminal
|
|
3415
3419
|
// / not-configured / timeout). While set, the framework silence
|
|
@@ -3423,43 +3427,6 @@ silencePoke.startTimer({
|
|
|
3423
3427
|
// Re-emit through the unified runtime-metrics fan-out (PostHog + JSONL).
|
|
3424
3428
|
emitRuntimeMetric(event)
|
|
3425
3429
|
},
|
|
3426
|
-
onAwarenessPing: async (ctx) => {
|
|
3427
|
-
// Early framework-owned awareness signal (~60s) so the user never
|
|
3428
|
-
// faces a silent chat while the model is busy / held / thinking.
|
|
3429
|
-
// Distinct from the 300s onFrameworkFallback: fires earlier, sends
|
|
3430
|
-
// a SILENT message (disable_notification: true — ambient liveness,
|
|
3431
|
-
// not a device buzz), and is bounded to ONE per turn by the silence-
|
|
3432
|
-
// poke module's `awarenessPingFired` flag. Reuses
|
|
3433
|
-
// `formatFrameworkFallbackText` so the wording stays consistent and
|
|
3434
|
-
// in-flight tools are named when known. If the model has been
|
|
3435
|
-
// silent long enough to cross 300s, the heavier framework_fallback
|
|
3436
|
-
// escalates with a notification.
|
|
3437
|
-
//
|
|
3438
|
-
// Late-fire guard mirrors the framework_fallback handler: skip if
|
|
3439
|
-
// the turn ended cleanly between the silence-poke arming and this
|
|
3440
|
-
// timer-fired handler so we don't talk over a clean response.
|
|
3441
|
-
if (activeTurnStartedAt.get(ctx.key) == null && currentTurn == null) {
|
|
3442
|
-
return
|
|
3443
|
-
}
|
|
3444
|
-
const text = silencePoke.formatFrameworkFallbackText(
|
|
3445
|
-
ctx.fallbackKind,
|
|
3446
|
-
ctx.silenceMs,
|
|
3447
|
-
ctx.inFlightTools,
|
|
3448
|
-
)
|
|
3449
|
-
try {
|
|
3450
|
-
await robustApiCall(
|
|
3451
|
-
() => bot.api.sendMessage(ctx.chatId, text, {
|
|
3452
|
-
...(ctx.threadId != null ? { message_thread_id: ctx.threadId } : {}),
|
|
3453
|
-
disable_notification: true,
|
|
3454
|
-
}),
|
|
3455
|
-
{ chat_id: ctx.chatId, ...(ctx.threadId != null ? { threadId: ctx.threadId } : {}) },
|
|
3456
|
-
)
|
|
3457
|
-
} catch (err) {
|
|
3458
|
-
process.stderr.write(
|
|
3459
|
-
`silence-poke awareness-ping sendMessage failed chat=${ctx.chatId} thread=${ctx.threadId}: ${err}\n`,
|
|
3460
|
-
)
|
|
3461
|
-
}
|
|
3462
|
-
},
|
|
3463
3430
|
onFrameworkFallback: async (ctx) => {
|
|
3464
3431
|
// Late-fire short-circuit (2026-05-23 audit finding). The fallback
|
|
3465
3432
|
// can race a clean turn-end: the model's actual reply lands inside
|
|
@@ -4050,25 +4017,6 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
4050
4017
|
process.stderr.write(`telegram gateway: ipc: tool_call tool=${msg.tool} agent=${client.agentName ?? '-'} clientId=${client.id ?? '-'} callId=${msg.id}\n`)
|
|
4051
4018
|
try {
|
|
4052
4019
|
const result = await executeToolCall(msg.tool, msg.args)
|
|
4053
|
-
// #1122 silence-poke chokepoint: piggyback any armed poke onto the
|
|
4054
|
-
// tool result's content text. The model sees the [silence-poke]
|
|
4055
|
-
// system-reminder block as part of the next conversational turn.
|
|
4056
|
-
// No-op when nothing is armed (the common case) — cost is one
|
|
4057
|
-
// map iteration over <=N active turns (typically 1).
|
|
4058
|
-
const reminder = silencePoke.consumeArmedPoke()
|
|
4059
|
-
if (reminder != null && result != null && typeof result === 'object') {
|
|
4060
|
-
const r = result as { content?: Array<{ type: string; text: string }> }
|
|
4061
|
-
if (Array.isArray(r.content) && r.content.length > 0 && r.content[0]!.type === 'text') {
|
|
4062
|
-
r.content[0]!.text = `${r.content[0]!.text}\n\n<system-reminder>\n${reminder}\n</system-reminder>`
|
|
4063
|
-
} else {
|
|
4064
|
-
// Tool result didn't carry a text block to wrap — re-shape so
|
|
4065
|
-
// the reminder still reaches the model.
|
|
4066
|
-
r.content = [
|
|
4067
|
-
...(Array.isArray(r.content) ? r.content : []),
|
|
4068
|
-
{ type: 'text', text: `<system-reminder>\n${reminder}\n</system-reminder>` },
|
|
4069
|
-
]
|
|
4070
|
-
}
|
|
4071
|
-
}
|
|
4072
4020
|
return { type: 'tool_call_result', id: msg.id, success: true, result }
|
|
4073
4021
|
} catch (err) {
|
|
4074
4022
|
return {
|
|
@@ -4092,17 +4040,13 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
4092
4040
|
progressDriver?.ingest(ev, chatHint, threadHint)
|
|
4093
4041
|
handleSessionEvent(ev)
|
|
4094
4042
|
// #1122 silence-poke: surface activity signals from the session
|
|
4095
|
-
// stream so the fallback message wording is honest
|
|
4096
|
-
//
|
|
4043
|
+
// stream so the 300s framework-fallback message wording is honest
|
|
4044
|
+
// (thinking vs working, plus the longest-running in-flight tool).
|
|
4097
4045
|
if (currentTurn != null) {
|
|
4098
4046
|
const key = statusKey(currentTurn.sessionChatId, currentTurn.sessionThreadId)
|
|
4099
4047
|
if (ev.kind === 'thinking') {
|
|
4100
4048
|
silencePoke.noteThinking(key, Date.now())
|
|
4101
4049
|
} else if (ev.kind === 'tool_use') {
|
|
4102
|
-
if (ev.toolName === 'Task' || ev.toolName === 'Agent') {
|
|
4103
|
-
// Built-in claude sub-agent dispatch — extends soft threshold to 5min.
|
|
4104
|
-
silencePoke.noteSubagentDispatch(key)
|
|
4105
|
-
}
|
|
4106
4050
|
// #1292: track in-flight tool calls so the 300s framework
|
|
4107
4051
|
// fallback message can name the actual observable (e.g.
|
|
4108
4052
|
// "running Grep \"foo\" for 4m") instead of the dishonest
|
|
@@ -4652,6 +4596,89 @@ const ipcServer: IpcServer = createIpcServer({
|
|
|
4652
4596
|
log: (msg) => process.stderr.write(`telegram gateway: ipc — ${msg}\n`),
|
|
4653
4597
|
})
|
|
4654
4598
|
|
|
4599
|
+
// ─── Webhook ingest server (RFC webhook-via-gateway-socket) ───────────────
// Under the Docker runtime the host-side web receiver runs as the operator
// UID and can neither write this agent's UID-owned dir (EACCES 500) nor
// connect to gateway.sock. When `channels.telegram.webhook_via_gateway` is
// set, the receiver instead forwards verified+rendered events here over a
// dedicated peercred-gated UDS; this gateway (agent UID) owns the jsonl
// append, dedup, and dispatch firing. Wrapped so any failure is best-effort
// and can NEVER crash gateway boot (which would take the agent down).
;(() => {
  // Thrown values are not guaranteed to be Error instances, so never read
  // `.message` off them blindly.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err)

  try {
    // No agent identity → nothing to look up in the config; feature stays off.
    const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
    if (!selfAgent) return

    // Opt-in probe. A config failure is logged and treated as "not enabled".
    let viaGateway = false
    try {
      const cfg = loadSwitchroomConfig()
      const raw = cfg.agents?.[selfAgent]
      viaGateway = raw
        ? resolveAgentConfig(cfg.defaults, cfg.profiles, raw).channels?.telegram
            ?.webhook_via_gateway === true
        : false
    } catch (err) {
      process.stderr.write(
        `telegram gateway: webhook-ingest config probe failed: ${describeError(err)}\n`,
      )
    }
    if (!viaGateway) return

    // Allowed peer UIDs: the agent's own UID (self-connections) + the
    // operator/receiver UID emitted into the env by the compose generator
    // (SWITCHROOM_WEBHOOK_RECEIVER_UID == operatorUid). Fail-closed: if the
    // receiver UID is unset, only self can connect → receiver gets 503,
    // surfacing the misconfiguration instead of silently accepting any UID.
    const allowedUids: number[] = []
    const ownUid = typeof process.getuid === 'function' ? process.getuid() : null
    if (ownUid !== null) allowedUids.push(ownUid)

    // Parse strictly as decimal digits. `Number()` alone would turn a
    // whitespace-only value into 0 (root) and accept hex/exponent/negative
    // spellings, none of which the compose generator emits.
    const receiverUidRaw = process.env.SWITCHROOM_WEBHOOK_RECEIVER_UID
    const receiverUidText = receiverUidRaw?.trim() ?? ''
    const receiverUid = /^\d+$/.test(receiverUidText) ? Number(receiverUidText) : NaN
    if (Number.isInteger(receiverUid) && receiverUid <= 0xffffffff) {
      allowedUids.push(receiverUid)
    } else if (receiverUidRaw !== undefined && receiverUidRaw !== '') {
      // Set but unusable: say so, and stay fail-closed (self UID only).
      process.stderr.write(
        `telegram gateway: webhook-ingest ignoring invalid SWITCHROOM_WEBHOOK_RECEIVER_UID=${JSON.stringify(receiverUidRaw)}\n`,
      )
    }

    const socketPath = join(STATE_DIR, 'webhook.sock')

    // In-process delivery of a synthesized webhook turn — the same
    // sendToAgent + buffer-on-failure primitive onInjectInbound uses, so a
    // webhook fire landing mid bridge-reconnect is buffered, not dropped.
    const webhookInject = (agentName: string, inbound: unknown): boolean => {
      // The wire record is a structural mirror of the bridge's
      // InboundMessage (same cast the scheduler's ipcDispatcher uses).
      const msg = inbound as InboundMessage
      const delivered = ipcServer.sendToAgent(agentName, msg)
      if (delivered) markClaudeBusyForInbound(msg)
      else pendingInboundBuffer.push(agentName, msg)
      return delivered
    }

    startWebhookIngestServer({
      socketPath,
      allowedUids,
      log: (s) => process.stderr.write(`telegram gateway: ${s}`),
      // The record is always attributed to this gateway's own agent name;
      // `req.agent` from the wire is not used here.
      onRecord: (req) =>
        recordWebhookEvent(
          {
            agent: selfAgent,
            source: req.source,
            event_type: req.event_type,
            ts: req.ts,
            rendered_text: req.rendered_text,
            payload: req.payload,
            ...(req.delivery_id ? { delivery_id: req.delivery_id } : {}),
          },
          {
            inject: webhookInject,
            log: (s) => process.stderr.write(`telegram gateway: ${s}`),
          },
        ),
    })
  } catch (err) {
    process.stderr.write(
      `telegram gateway: webhook-ingest server start failed (non-fatal): ${describeError(err)}\n`,
    )
  }
})()
|
|
4681
|
+
|
|
4655
4682
|
// ─── Opportunistic idle-drain of pendingInboundBuffer ─────────────────────
|
|
4656
4683
|
// pendingInboundBuffer otherwise drains only on (a) bridge re-register
|
|
4657
4684
|
// (onClientRegistered) or (b) the silence-poke framework fallback
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for the peercred-gated webhook ingest UDS server
|
|
3
|
+
* (RFC docs/rfcs/webhook-via-gateway-socket.md).
|
|
4
|
+
*
|
|
5
|
+
* MUST run under `bun test`: the peer-credential gate calls
|
|
6
|
+
* `getPeerCred` (bun:ffi getsockopt SO_PEERCRED), which returns null
|
|
7
|
+
* under node/vitest — so the allow path is only exercisable under bun.
|
|
8
|
+
* This file is excluded from the vitest run (see vitest.config.ts) and
|
|
9
|
+
* listed in the `test:bun` / `test` scripts in package.json.
|
|
10
|
+
*
|
|
11
|
+
* Sockets live in an isolated tmpdir — never `~/.switchroom`.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { describe, it, expect, afterEach } from 'bun:test'
|
|
15
|
+
import net from 'node:net'
|
|
16
|
+
import { mkdtempSync } from 'node:fs'
|
|
17
|
+
import { tmpdir } from 'node:os'
|
|
18
|
+
import { join } from 'node:path'
|
|
19
|
+
import {
|
|
20
|
+
startWebhookIngestServer,
|
|
21
|
+
type WebhookIngestServer,
|
|
22
|
+
type WebhookIngestRequest,
|
|
23
|
+
} from './webhook-ingest-server.js'
|
|
24
|
+
import { forwardToGateway } from '../../src/web/webhook-ingest-client.js'
|
|
25
|
+
|
|
26
|
+
// Registry of every server a test starts, so teardown can always reach it.
const servers: WebhookIngestServer[] = []
afterEach(() => {
  // splice(0) empties the shared registry and hands back what it held, so
  // the next test starts with a clean list.
  const started = servers.splice(0)
  started.forEach((srv) => srv.close())
})
|
|
30
|
+
|
|
31
|
+
/**
 * Build a socket path inside a freshly created, uniquely named directory
 * under the OS temp dir. Only the directory is created; nothing is bound at
 * the returned path.
 */
function tmpSocket(): string {
  const prefix = join(tmpdir(), 'webhook-ingest-srv-')
  return join(mkdtempSync(prefix), 'webhook.sock')
}
|
|
35
|
+
|
|
36
|
+
/** Fresh, fully populated request fixture (a new object on every call). */
function sampleReq(): WebhookIngestRequest {
  const fixture: WebhookIngestRequest = {
    agent: 'reggie',
    source: 'github',
    event_type: 'pull_request',
    ts: 1_700_000_000_000,
    rendered_text: 'PR opened',
    payload: { action: 'opened', number: 7 },
    delivery_id: 'd-7',
  }
  return fixture
}
|
|
47
|
+
|
|
48
|
+
// End-to-end checks of the ingest server over a real Unix socket: allow path,
// deny path, missing socket, and an error status returned by onRecord.
describe('startWebhookIngestServer (peercred-gated)', () => {
  it('accepts a connection from an allowed uid and round-trips onRecord', async () => {
    const socketPath = tmpSocket()
    let received: WebhookIngestRequest | null = null
    const server = startWebhookIngestServer({
      socketPath,
      allowedUids: [process.getuid!()], // our own uid → allowed
      onRecord: (req) => {
        received = req
        return { status: 'ok', ts: req.ts, dispatched: 1 }
      },
      log: () => {},
    })
    servers.push(server)

    const resp = await forwardToGateway(socketPath, sampleReq())

    // The response must be exactly what onRecord returned...
    expect(resp).not.toBeNull()
    expect(resp!.status).toBe('ok')
    expect(resp!.ts).toBe(1_700_000_000_000)
    expect(resp!.dispatched).toBe(1)
    // ...and onRecord must have seen the request that was sent.
    expect(received).not.toBeNull()
    expect(received!.agent).toBe('reggie')
    expect(received!.event_type).toBe('pull_request')
    expect(received!.delivery_id).toBe('d-7')
  })

  it('denies a connection whose uid is not in allowedUids (onRecord never runs)', async () => {
    const socketPath = tmpSocket()
    let called = false
    const server = startWebhookIngestServer({
      socketPath,
      allowedUids: [999_999], // definitely not our uid
      onRecord: () => {
        called = true
        return { status: 'ok' }
      },
      log: () => {},
    })
    servers.push(server)

    // The server destroys the connection in its accept callback BEFORE
    // reading any bytes, so onRecord must never run. We assert the
    // security property (no service for an unlisted uid) directly rather
    // than via the client resolving: bun's net client does not observe a
    // server-side destroy of a just-accepted UDS connection promptly, so
    // awaiting forwardToGateway here would hang. Send raw bytes and give
    // the server ample time to (not) process them.
    const raw = net.createConnection(socketPath)
    raw.on('connect', () => raw.write(JSON.stringify(sampleReq()) + '\n'))
    // Swallow client-side socket errors so they cannot fail the test.
    raw.on('error', () => {})
    await new Promise((r) => setTimeout(r, 300))
    raw.destroy()

    expect(called).toBe(false)
  })

  it('returns null when the socket does not exist (gateway down)', async () => {
    const socketPath = tmpSocket() // never bound
    const resp = await forwardToGateway(socketPath, sampleReq(), { timeoutMs: 2000 })
    expect(resp).toBeNull()
  })

  it('surfaces a gateway-side error status from onRecord', async () => {
    const socketPath = tmpSocket()
    const server = startWebhookIngestServer({
      socketPath,
      allowedUids: [process.getuid!()],
      onRecord: () => ({ status: 'error', error: 'write failed' }),
      log: () => {},
    })
    servers.push(server)

    // An 'error' status is still a delivered response, not a null.
    const resp = await forwardToGateway(socketPath, sampleReq())
    expect(resp).not.toBeNull()
    expect(resp!.status).toBe('error')
  })
})
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Webhook ingest UDS server (RFC docs/rfcs/webhook-via-gateway-socket.md).
|
|
3
|
+
*
|
|
4
|
+
* A dedicated, peercred-gated Unix socket the host-side web receiver
|
|
5
|
+
* forwards verified webhook events to. It is deliberately SEPARATE from
|
|
6
|
+
* the bridge IPC socket (`gateway.sock`):
|
|
7
|
+
*
|
|
8
|
+
* - `gateway.sock` is bound via `Bun.listen`, whose accepted socket does
|
|
9
|
+
* NOT expose a file descriptor — so SO_PEERCRED can't be read on it.
|
|
10
|
+
* This server uses `node:net` (whose `Socket._handle.fd` IS readable
|
|
11
|
+
* under Bun, verified empirically) precisely so the peer-credential
|
|
12
|
+
* gate works.
|
|
13
|
+
* - Keeping the chat-critical bridge socket untouched avoids any risk to
|
|
14
|
+
* the bridge-flap-sensitive reconnect path.
|
|
15
|
+
*
|
|
16
|
+
* Auth model — two independent layers:
|
|
17
|
+
* 1. **Filesystem**: the socket is chmod 0o666 so the host operator UID
|
|
18
|
+
* can connect at all (the agent dir itself is 0775/agent-UID, which
|
|
19
|
+
* blocks the operator from writing files but not from connecting a
|
|
20
|
+
* world-connectable socket).
|
|
21
|
+
* 2. **Peer credentials**: every connection's SO_PEERCRED UID must be in
|
|
22
|
+
* `allowedUids` (the agent's own UID + the operator/receiver UID).
|
|
23
|
+
* This is the load-bearing gate: it ensures only the trusted receiver
|
|
24
|
+
* (which enforces per-event HMAC) — or the agent itself — can inject a
|
|
25
|
+
* webhook turn. Fail-closed: unreadable creds or an unlisted UID is
|
|
26
|
+
* denied.
|
|
27
|
+
*
|
|
28
|
+
* Wire protocol: one JSON line in (`WebhookIngestRequest`), one JSON line
|
|
29
|
+
* out (`WebhookIngestResponse`), then the connection closes. No framing
|
|
30
|
+
* beyond the trailing newline; requests are small (a rendered event +
|
|
31
|
+
* payload).
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
import net from 'node:net'
|
|
35
|
+
import { chmodSync, existsSync, unlinkSync } from 'node:fs'
|
|
36
|
+
import { getPeerCred } from '../../src/vault/broker/peercred-ffi.js'
|
|
37
|
+
|
|
38
|
+
/**
 * Forwarded by the receiver; structural mirror of WebhookGatewayRecord.
 *
 * Arrives as a single JSON line. Before `onRecord` is invoked the server
 * checks that `agent`, `source`, `event_type` and `rendered_text` are strings
 * and that `payload` is a non-null object; `ts` and `delivery_id` are not
 * checked there.
 */
export interface WebhookIngestRequest {
  agent: string
  source: string
  event_type: string
  // presumably epoch milliseconds (the test fixture uses 1_700_000_000_000) — confirm
  ts: number
  rendered_text: string
  payload: Record<string, unknown>
  // Optional; omitted from the wire when the receiver has none.
  delivery_id?: string
}
|
|
48
|
+
|
|
49
|
+
/**
 * The single JSON line written back to the peer before the connection is
 * ended. The server itself only ever produces `status: 'error'` (oversized,
 * malformed or mis-shaped request, handler failure, timeout); every other
 * value is whatever `onRecord` returned.
 */
export interface WebhookIngestResponse {
  status: 'ok' | 'deduped' | 'error'
  ts?: number
  // Short reason accompanying `status: 'error'`.
  error?: string
  // NOTE(review): looks like a count of dispatches fired for the event — confirm against recordWebhookEvent.
  dispatched?: number
}
|
|
55
|
+
|
|
56
|
+
/** Configuration accepted by `startWebhookIngestServer`. */
export interface WebhookIngestServerOptions {
  /** Filesystem path of the Unix socket to bind. Any existing file at this
   *  path is unlinked before binding. */
  socketPath: string
  /** SO_PEERCRED UIDs permitted to inject. Connections from any other UID
   *  (or with unreadable creds) are denied and the socket destroyed. */
  allowedUids: number[]
  /** Handle one verified, forwarded event. Synchronous return (the record
   *  path is file I/O + an in-process inject) wrapped in Promise for the
   *  server's await. */
  onRecord: (req: WebhookIngestRequest) => WebhookIngestResponse | Promise<WebhookIngestResponse>
  /** Sink for diagnostic lines (each already newline-terminated). Defaults
   *  to writing to stderr. */
  log?: (line: string) => void
}
|
|
67
|
+
|
|
68
|
+
/** Handle returned by `startWebhookIngestServer`. */
export interface WebhookIngestServer {
  /** Stop the server and unlink the socket path; errors from either step
   *  are swallowed. */
  close: () => void
}
|
|
71
|
+
|
|
72
|
+
// Upper bound on a buffered request before it is rejected as too large.
// NOTE(review): the check compares this against the decoded string's
// `.length` (UTF-16 code units), not a byte count, so multi-byte payloads
// can exceed 1 MiB on the wire before tripping it.
const MAX_REQUEST_BYTES = 1024 * 1024 // 1 MiB — github payloads fit easily
|
|
73
|
+
|
|
74
|
+
/**
 * Read the accepted connection's fd via the undocumented `_handle.fd`.
 * Under Bun's node:net polyfill this is present (verified).
 *
 * @param conn accepted socket to inspect
 * @returns the descriptor when it is a non-negative integer; `null` when the
 *   handle or fd is missing, negative, or not an integer (NaN / fractional),
 *   so the caller fails closed instead of probing a bogus descriptor.
 */
function fdOf(conn: net.Socket): number | null {
  // `_handle` is private API: tolerate it being absent or null.
  const fd = (conn as unknown as { _handle?: { fd?: unknown } | null })._handle?.fd
  return typeof fd === 'number' && Number.isInteger(fd) && fd >= 0 ? fd : null
}
|
|
82
|
+
|
|
83
|
+
/**
 * Start the webhook ingest server.
 *
 * Binds a `node:net` server on the Unix socket at `opts.socketPath` (after
 * unlinking any existing file there) and serves one request per connection:
 * one JSON line in, one JSON line out, then the connection is ended.
 *
 * Designed not to throw — the stale-socket unlink, `listen` and `chmod` are
 * each guarded and logged, and asynchronous server errors go to the 'error'
 * listener — so the returned handle is always usable, at worst as a no-op
 * close. The CALLER (gateway boot) additionally wraps this so a failure here
 * can never take the agent down; this function's own try/catch is
 * belt-and-suspenders.
 *
 * @param opts socket path, allowed peer UIDs, the record handler and an
 *   optional log sink (defaults to stderr)
 * @returns handle whose `close()` stops the server and unlinks the socket
 */
export function startWebhookIngestServer(
  opts: WebhookIngestServerOptions,
): WebhookIngestServer {
  const log = opts.log ?? ((s: string) => process.stderr.write(s))
  const allowed = new Set(opts.allowedUids)

  // Clear a stale socket from a previous (crashed) gateway. Safe: only this
  // agent's gateway binds this path, and we're about to rebind it.
  try {
    if (existsSync(opts.socketPath)) unlinkSync(opts.socketPath)
  } catch (err) {
    log(`webhook-ingest-server: could not unlink stale socket: ${(err as Error).message}\n`)
  }

  const server = net.createServer((conn) => {
    // ── Peer-credential gate (fail-closed) ──────────────────────────────────
    // Runs before any listener is attached, so a denied peer is never read.
    const fd = fdOf(conn)
    const cred = fd !== null ? getPeerCred(fd) : null
    if (cred === null || !allowed.has(cred.uid)) {
      log(
        `webhook-ingest-server: DENY connection uid=${cred?.uid ?? 'unknown'} ` +
          `(allowed=${[...allowed].join(',')})\n`,
      )
      conn.destroy()
      return
    }

    let buf = ''
    // `dispatched`: the request line has been handed to onRecord.
    // `handled`: the response has been written.
    // They differ while an asynchronous onRecord is still pending; without
    // the first latch, any chunk arriving in that window would re-parse the
    // same buffered line and invoke onRecord a second time.
    let dispatched = false
    let handled = false
    conn.setEncoding('utf8')

    // Write exactly one response line, then half-close. Later calls no-op.
    const reply = (resp: WebhookIngestResponse) => {
      if (handled) return
      handled = true
      try {
        conn.write(JSON.stringify(resp) + '\n')
      } catch {
        /* peer may have hung up */
      }
      conn.end()
    }

    conn.on('data', (chunk: string) => {
      if (handled || dispatched) return
      buf += chunk
      // `buf` is a decoded string, so this bounds UTF-16 code units.
      if (buf.length > MAX_REQUEST_BYTES) {
        reply({ status: 'error', error: 'request too large' })
        return
      }
      const nl = buf.indexOf('\n')
      if (nl === -1) return // wait for the full line
      const line = buf.slice(0, nl)

      let req: WebhookIngestRequest
      try {
        req = JSON.parse(line) as WebhookIngestRequest
      } catch {
        reply({ status: 'error', error: 'malformed request' })
        return
      }
      // Minimal shape check; `ts` and `delivery_id` are passed through as-is.
      if (
        !req ||
        typeof req.agent !== 'string' ||
        typeof req.source !== 'string' ||
        typeof req.event_type !== 'string' ||
        typeof req.rendered_text !== 'string' ||
        typeof req.payload !== 'object' ||
        req.payload === null
      ) {
        reply({ status: 'error', error: 'invalid request shape' })
        return
      }

      dispatched = true
      // Promise.resolve().then(...) funnels both a synchronous throw and an
      // asynchronous rejection from onRecord into the same catch.
      Promise.resolve()
        .then(() => opts.onRecord(req))
        .then((resp) => reply(resp))
        .catch((err: unknown) => {
          log(`webhook-ingest-server: onRecord threw: ${String(err)}\n`)
          reply({ status: 'error', error: 'internal error' })
        })
    })

    conn.on('error', (err) => {
      log(`webhook-ingest-server: conn error: ${err.message}\n`)
    })

    // Drop slow/idle clients so a stuck connection can't pin the socket.
    // Best-effort timeout reply (if nothing was sent yet), then force-close.
    conn.setTimeout(10_000, () => {
      if (!handled) reply({ status: 'error', error: 'timeout' })
      conn.destroy()
    })
  })

  server.on('error', (err) => {
    log(`webhook-ingest-server: server error: ${err.message}\n`)
  })

  try {
    server.listen(opts.socketPath, () => {
      try {
        // World-connectable at the FS layer; peercred is the real gate.
        chmodSync(opts.socketPath, 0o666)
      } catch (err) {
        log(`webhook-ingest-server: chmod failed: ${(err as Error).message}\n`)
      }
      log(
        `webhook-ingest-server: listening at ${opts.socketPath} ` +
          `(allowed uids: ${[...allowed].join(',')})\n`,
      )
    })
  } catch (err) {
    log(`webhook-ingest-server: listen failed: ${(err as Error).message}\n`)
  }

  return {
    close: () => {
      try {
        server.close()
      } catch {
        /* ignore */
      }
      try {
        if (existsSync(opts.socketPath)) unlinkSync(opts.socketPath)
      } catch {
        /* ignore */
      }
    },
  }
}
|
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
|
|
20
20
|
import { dirname, join } from 'node:path'
|
|
21
21
|
import { captureEvent } from './analytics-posthog.js'
|
|
22
|
-
import type { PokeLevel } from './silence-poke.js'
|
|
23
22
|
|
|
24
23
|
export type RuntimeMetricEvent =
|
|
25
24
|
/**
|
|
@@ -63,40 +62,14 @@ export type RuntimeMetricEvent =
|
|
|
63
62
|
ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
|
|
64
63
|
}
|
|
65
64
|
/**
|
|
66
|
-
*
|
|
67
|
-
*
|
|
68
|
-
*
|
|
69
|
-
*
|
|
70
|
-
*
|
|
71
|
-
*
|
|
72
|
-
*
|
|
73
|
-
|
|
74
|
-
| {
|
|
75
|
-
kind: 'silence_poke_fired'
|
|
76
|
-
key: string
|
|
77
|
-
level: PokeLevel
|
|
78
|
-
silence_ms: number
|
|
79
|
-
subagent_wait: boolean
|
|
80
|
-
}
|
|
81
|
-
/**
|
|
82
|
-
* The model sent an outbound message within the success window
|
|
83
|
-
* (default 15s) after a poke fired. Pair with `silence_poke_fired`
|
|
84
|
-
* to compute success rate — the design target is >80%. (`ack`-level
|
|
85
|
-
* success is not currently emitted — the ack poke sits outside the
|
|
86
|
-
* `pokesFired` ladder noteOutbound measures against; the type admits
|
|
87
|
-
* `ack` only so the silence-poke metric union stays assignable.)
|
|
88
|
-
*/
|
|
89
|
-
| {
|
|
90
|
-
kind: 'silence_poke_succeeded'
|
|
91
|
-
key: string
|
|
92
|
-
level: PokeLevel
|
|
93
|
-
latency_ms: number
|
|
94
|
-
}
|
|
95
|
-
/**
|
|
96
|
-
* Last-resort: 5 minutes silent, the framework itself sent a
|
|
97
|
-
* user-visible "still working… / still thinking…" message. Should
|
|
98
|
-
* be rare (target <5 per 1000 turns); a high rate means the model
|
|
99
|
-
* is genuinely stuck or the soft/firm pokes aren't being honoured.
|
|
65
|
+
* Last-resort safety net: 5 minutes silent, the framework itself sent
|
|
66
|
+
* a user-visible "still working… / still thinking…" message AND
|
|
67
|
+
* unwedged the turn (cleared activeTurnStartedAt, nulled currentTurn,
|
|
68
|
+
* drained buffered inbound). Should be rare (target <5 per 1000 turns);
|
|
69
|
+
* a high rate means turns are genuinely getting stuck. This is the only
|
|
70
|
+
* remaining framework safety-net signal — the model-targeted nudge
|
|
71
|
+
* ladder (ack/soft/firm) and the 60s awareness ping were retired once
|
|
72
|
+
* the live-updating reply/draft took over the pacing job.
|
|
100
73
|
*/
|
|
101
74
|
| {
|
|
102
75
|
kind: 'silence_fallback_sent'
|
|
@@ -104,23 +77,6 @@ export type RuntimeMetricEvent =
|
|
|
104
77
|
fallback_kind: 'working' | 'thinking'
|
|
105
78
|
silence_ms: number
|
|
106
79
|
}
|
|
107
|
-
/**
|
|
108
|
-
* Awareness ping (~60s, default): framework-owned user-visible
|
|
109
|
-
* "still working… / still thinking…" message sent BEFORE the 300s
|
|
110
|
-
* fallback so the user never faces a silent chat for the full 5
|
|
111
|
-
* minutes. Silent (no device ping); one-shot per turn; suppressed
|
|
112
|
-
* by any outbound or sub-agent dispatch. A high rate is the
|
|
113
|
-
* diagnostic signal that frequent silences exist (held-inbound,
|
|
114
|
-
* extended-thinking, slow startup), and the rate of the heavier
|
|
115
|
-
* silence_fallback_sent that still follows tells us how many of
|
|
116
|
-
* those escalate all the way to 5 min.
|
|
117
|
-
*/
|
|
118
|
-
| {
|
|
119
|
-
kind: 'awareness_ping_sent'
|
|
120
|
-
key: string
|
|
121
|
-
fallback_kind: 'working' | 'thinking'
|
|
122
|
-
silence_ms: number
|
|
123
|
-
}
|
|
124
80
|
/**
|
|
125
81
|
* #1445 cross-turn pending-async ambient lifecycle. `started` fires
|
|
126
82
|
* when a turn ends with a captured anchor AND a pending Agent/Task/
|