switchroom 0.14.9 → 0.14.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -625,17 +625,21 @@ export async function startBootCard(
625
625
  ...(opts.updateOutcomeLine ? { updateOutcomeLine: opts.updateOutcomeLine } : {}),
626
626
  })
627
627
 
628
- // Silence the notification for operator-initiated redeploys. A
629
- // routine `switchroom update` should land in the chat as a record
630
- // but not buzz every user's phone — every agent posts a card, so
631
- // a fleet update with N agents produces N notifications otherwise.
632
- // We key on the reason-text prefix `operator:` (today only
633
- // `operator: switchroom update` writes this) so user-initiated
634
- // restarts (`user: /restart from chat`, `cli: switchroom restart`)
635
- // and unplanned events (crash, fresh, planned-marker) keep their
636
- // normal notification behaviour — the user explicitly asked for
637
- // those, or they need to know something went wrong.
638
- const silentBootCard = opts.restartReasonDetail?.startsWith('operator:') === true
628
+ // Boot cards are ALWAYS delivered silently (no Telegram
629
+ // notification). They land in the chat as a record — the operator
630
+ // can scroll up to see "✅ <agent> back up · vX.Y.Z" but they
631
+ // never buzz a phone. Rationale: every agent posts a card on every
632
+ // restart, so a fleet redeploy of N agents produced N notifications;
633
+ // even a single user `/restart` or a crash-recovery is a status
634
+ // record, not something that should pull attention. Operator
635
+ // decision (2026-05-29): silence them all, unconditionally.
636
+ //
637
+ // Previously this was keyed on the `operator:` reason-detail prefix
638
+ // (only routine `switchroom update` was silent); user `/restart`,
639
+ // `cli: switchroom restart` rollouts, crashes, and fresh boots all
640
+ // still notified. That distinction is gone — the card is the record,
641
+ // the chat is where you look, and nothing here warrants a push.
642
+ const silentBootCard = true
639
643
 
640
644
  let messageId: number
641
645
  try {
@@ -263,6 +263,8 @@ import { formatUpdateStatusLine } from './update-status-line.js'
263
263
  import type { HostdRequest } from '../../src/host-control/protocol.js'
264
264
  import type { AgentAudit } from '../welcome-text.js'
265
265
  import { shouldSweepChatAtBoot } from './boot-sweep-filter.js'
266
+ import { startWebhookIngestServer } from './webhook-ingest-server.js'
267
+ import { recordWebhookEvent } from '../../src/web/webhook-gateway-record.js'
266
268
 
267
269
  import { createIpcServer, type IpcClient, type IpcServer } from './ipc-server.js'
268
270
  import { handleRequestDriveApproval } from './drive-write-approval.js'
@@ -3405,11 +3407,13 @@ function ensureIssuesCard(chatId: string, threadId: number | undefined): void {
3405
3407
  }
3406
3408
 
3407
3409
  // #1122: framework safety-net for "model is silent to the user for >5min."
3408
- // Starts a single setInterval poll that walks active turns and arms
3409
- // soft/firm poke reminders piggybacked on the next tool result. At 300s
3410
- // the framework itself sends a user-visible "still working… / still
3411
- // thinking…" message. Honours SWITCHROOM_DISABLE_SILENCE_POKE=1 kill
3412
- // switch (no-op if set).
3410
+ // Starts a single setInterval poll that walks active turns; at 300s of
3411
+ // silence the framework itself sends a user-visible "still working… /
3412
+ // still thinking…" message AND unwedges the turn. The model-targeted
3413
+ // nudge ladder (ack/soft/firm) and the 60s awareness ping were retired
3414
+ // once the live-updating reply/draft took over the pacing job — only
3415
+ // this single unwedge fallback remains. Honours
3416
+ // SWITCHROOM_DISABLE_SILENCE_POKE=1 kill switch (no-op if set).
3413
3417
  // Set when this gateway dispatches an `update_apply` to hostd that
3414
3418
  // returns `started`; cleared when the dispatch poll resolves (terminal
3415
3419
  // / not-configured / timeout). While set, the framework silence
@@ -3423,43 +3427,6 @@ silencePoke.startTimer({
3423
3427
  // Re-emit through the unified runtime-metrics fan-out (PostHog + JSONL).
3424
3428
  emitRuntimeMetric(event)
3425
3429
  },
3426
- onAwarenessPing: async (ctx) => {
3427
- // Early framework-owned awareness signal (~60s) so the user never
3428
- // faces a silent chat while the model is busy / held / thinking.
3429
- // Distinct from the 300s onFrameworkFallback: fires earlier, sends
3430
- // a SILENT message (disable_notification: true — ambient liveness,
3431
- // not a device buzz), and is bounded to ONE per turn by the silence-
3432
- // poke module's `awarenessPingFired` flag. Reuses
3433
- // `formatFrameworkFallbackText` so the wording stays consistent and
3434
- // in-flight tools are named when known. If the model has been
3435
- // silent long enough to cross 300s, the heavier framework_fallback
3436
- // escalates with a notification.
3437
- //
3438
- // Late-fire guard mirrors the framework_fallback handler: skip if
3439
- // the turn ended cleanly between the silence-poke arming and this
3440
- // timer-fired handler so we don't talk over a clean response.
3441
- if (activeTurnStartedAt.get(ctx.key) == null && currentTurn == null) {
3442
- return
3443
- }
3444
- const text = silencePoke.formatFrameworkFallbackText(
3445
- ctx.fallbackKind,
3446
- ctx.silenceMs,
3447
- ctx.inFlightTools,
3448
- )
3449
- try {
3450
- await robustApiCall(
3451
- () => bot.api.sendMessage(ctx.chatId, text, {
3452
- ...(ctx.threadId != null ? { message_thread_id: ctx.threadId } : {}),
3453
- disable_notification: true,
3454
- }),
3455
- { chat_id: ctx.chatId, ...(ctx.threadId != null ? { threadId: ctx.threadId } : {}) },
3456
- )
3457
- } catch (err) {
3458
- process.stderr.write(
3459
- `silence-poke awareness-ping sendMessage failed chat=${ctx.chatId} thread=${ctx.threadId}: ${err}\n`,
3460
- )
3461
- }
3462
- },
3463
3430
  onFrameworkFallback: async (ctx) => {
3464
3431
  // Late-fire short-circuit (2026-05-23 audit finding). The fallback
3465
3432
  // can race a clean turn-end: the model's actual reply lands inside
@@ -4050,25 +4017,6 @@ const ipcServer: IpcServer = createIpcServer({
4050
4017
  process.stderr.write(`telegram gateway: ipc: tool_call tool=${msg.tool} agent=${client.agentName ?? '-'} clientId=${client.id ?? '-'} callId=${msg.id}\n`)
4051
4018
  try {
4052
4019
  const result = await executeToolCall(msg.tool, msg.args)
4053
- // #1122 silence-poke chokepoint: piggyback any armed poke onto the
4054
- // tool result's content text. The model sees the [silence-poke]
4055
- // system-reminder block as part of the next conversational turn.
4056
- // No-op when nothing is armed (the common case) — cost is one
4057
- // map iteration over <=N active turns (typically 1).
4058
- const reminder = silencePoke.consumeArmedPoke()
4059
- if (reminder != null && result != null && typeof result === 'object') {
4060
- const r = result as { content?: Array<{ type: string; text: string }> }
4061
- if (Array.isArray(r.content) && r.content.length > 0 && r.content[0]!.type === 'text') {
4062
- r.content[0]!.text = `${r.content[0]!.text}\n\n<system-reminder>\n${reminder}\n</system-reminder>`
4063
- } else {
4064
- // Tool result didn't carry a text block to wrap — re-shape so
4065
- // the reminder still reaches the model.
4066
- r.content = [
4067
- ...(Array.isArray(r.content) ? r.content : []),
4068
- { type: 'text', text: `<system-reminder>\n${reminder}\n</system-reminder>` },
4069
- ]
4070
- }
4071
- }
4072
4020
  return { type: 'tool_call_result', id: msg.id, success: true, result }
4073
4021
  } catch (err) {
4074
4022
  return {
@@ -4092,17 +4040,13 @@ const ipcServer: IpcServer = createIpcServer({
4092
4040
  progressDriver?.ingest(ev, chatHint, threadHint)
4093
4041
  handleSessionEvent(ev)
4094
4042
  // #1122 silence-poke: surface activity signals from the session
4095
- // stream so the fallback message wording is honest and so
4096
- // subagent-dispatch waits don't fire spurious soft pokes.
4043
+ // stream so the 300s framework-fallback message wording is honest
4044
+ // (thinking vs working, plus the longest-running in-flight tool).
4097
4045
  if (currentTurn != null) {
4098
4046
  const key = statusKey(currentTurn.sessionChatId, currentTurn.sessionThreadId)
4099
4047
  if (ev.kind === 'thinking') {
4100
4048
  silencePoke.noteThinking(key, Date.now())
4101
4049
  } else if (ev.kind === 'tool_use') {
4102
- if (ev.toolName === 'Task' || ev.toolName === 'Agent') {
4103
- // Built-in claude sub-agent dispatch — extends soft threshold to 5min.
4104
- silencePoke.noteSubagentDispatch(key)
4105
- }
4106
4050
  // #1292: track in-flight tool calls so the 300s framework
4107
4051
  // fallback message can name the actual observable (e.g.
4108
4052
  // "running Grep \"foo\" for 4m") instead of the dishonest
@@ -4652,6 +4596,89 @@ const ipcServer: IpcServer = createIpcServer({
4652
4596
  log: (msg) => process.stderr.write(`telegram gateway: ipc — ${msg}\n`),
4653
4597
  })
4654
4598
 
4599
+ // ─── Webhook ingest server (RFC webhook-via-gateway-socket) ───────────────
4600
+ // Under the Docker runtime the host-side web receiver runs as the operator
4601
+ // UID and cannot write to this agent's UID-owned dir (EACCES 500) nor connect
4602
+ // to gateway.sock. When `channels.telegram.webhook_via_gateway` is set, the
4603
+ // receiver instead forwards verified+rendered events here over a dedicated
4604
+ // peercred-gated UDS; this gateway (agent UID) owns the jsonl append,
4605
+ // dedup, and dispatch firing. Wrapped so any failure is best-effort and can
4606
+ // NEVER crash gateway boot (which would take the agent down).
4607
+ ;(() => {
4608
+ try {
4609
+ const selfAgent = process.env.SWITCHROOM_AGENT_NAME ?? ''
4610
+ if (!selfAgent) return
4611
+
4612
+ let viaGateway = false
4613
+ try {
4614
+ const cfg = loadSwitchroomConfig()
4615
+ const raw = cfg.agents?.[selfAgent]
4616
+ viaGateway = raw
4617
+ ? resolveAgentConfig(cfg.defaults, cfg.profiles, raw).channels?.telegram
4618
+ ?.webhook_via_gateway === true
4619
+ : false
4620
+ } catch (err) {
4621
+ process.stderr.write(
4622
+ `telegram gateway: webhook-ingest config probe failed: ${(err as Error).message}\n`,
4623
+ )
4624
+ }
4625
+ if (!viaGateway) return
4626
+
4627
+ // Allowed peer UIDs: the agent's own UID (self-connections) + the
4628
+ // operator/receiver UID emitted into the env by the compose generator
4629
+ // (SWITCHROOM_WEBHOOK_RECEIVER_UID == operatorUid). Fail-closed: if the
4630
+ // receiver UID is unset, only self can connect → receiver gets 503,
4631
+ // surfacing the misconfiguration instead of silently accepting any UID.
4632
+ const allowedUids: number[] = []
4633
+ const ownUid = typeof process.getuid === 'function' ? process.getuid() : null
4634
+ if (ownUid !== null) allowedUids.push(ownUid)
4635
+ const receiverUidRaw = process.env.SWITCHROOM_WEBHOOK_RECEIVER_UID
4636
+ const receiverUid = receiverUidRaw ? Number(receiverUidRaw) : NaN
4637
+ if (Number.isInteger(receiverUid)) allowedUids.push(receiverUid)
4638
+
4639
+ const socketPath = join(STATE_DIR, 'webhook.sock')
4640
+
4641
+ // In-process delivery of a synthesized webhook turn — the same
4642
+ // sendToAgent + buffer-on-failure primitive onInjectInbound uses, so a
4643
+ // webhook fire landing mid bridge-reconnect is buffered, not dropped.
4644
+ const webhookInject = (agentName: string, inbound: unknown): boolean => {
4645
+ // The wire record is a structural mirror of the bridge's
4646
+ // InboundMessage (same cast the scheduler's ipcDispatcher uses).
4647
+ const msg = inbound as InboundMessage
4648
+ const delivered = ipcServer.sendToAgent(agentName, msg)
4649
+ if (delivered) markClaudeBusyForInbound(msg)
4650
+ else pendingInboundBuffer.push(agentName, msg)
4651
+ return delivered
4652
+ }
4653
+
4654
+ startWebhookIngestServer({
4655
+ socketPath,
4656
+ allowedUids,
4657
+ log: (s) => process.stderr.write(`telegram gateway: ${s}`),
4658
+ onRecord: (req) =>
4659
+ recordWebhookEvent(
4660
+ {
4661
+ agent: selfAgent,
4662
+ source: req.source,
4663
+ event_type: req.event_type,
4664
+ ts: req.ts,
4665
+ rendered_text: req.rendered_text,
4666
+ payload: req.payload,
4667
+ ...(req.delivery_id ? { delivery_id: req.delivery_id } : {}),
4668
+ },
4669
+ {
4670
+ inject: webhookInject,
4671
+ log: (s) => process.stderr.write(`telegram gateway: ${s}`),
4672
+ },
4673
+ ),
4674
+ })
4675
+ } catch (err) {
4676
+ process.stderr.write(
4677
+ `telegram gateway: webhook-ingest server start failed (non-fatal): ${(err as Error).message}\n`,
4678
+ )
4679
+ }
4680
+ })()
4681
+
4655
4682
  // ─── Opportunistic idle-drain of pendingInboundBuffer ─────────────────────
4656
4683
  // pendingInboundBuffer otherwise drains only on (a) bridge re-register
4657
4684
  // (onClientRegistered) or (b) the silence-poke framework fallback
@@ -0,0 +1,125 @@
1
+ /**
2
+ * Tests for the peercred-gated webhook ingest UDS server
3
+ * (RFC docs/rfcs/webhook-via-gateway-socket.md).
4
+ *
5
+ * MUST run under `bun test`: the peer-credential gate calls
6
+ * `getPeerCred` (bun:ffi getsockopt SO_PEERCRED), which returns null
7
+ * under node/vitest — so the allow path is only exercisable under bun.
8
+ * This file is excluded from the vitest run (see vitest.config.ts) and
9
+ * listed in the `test:bun` / `test` scripts in package.json.
10
+ *
11
+ * Sockets live in an isolated tmpdir — never `~/.switchroom`.
12
+ */
13
+
14
+ import { describe, it, expect, afterEach } from 'bun:test'
15
+ import net from 'node:net'
16
+ import { mkdtempSync } from 'node:fs'
17
+ import { tmpdir } from 'node:os'
18
+ import { join } from 'node:path'
19
+ import {
20
+ startWebhookIngestServer,
21
+ type WebhookIngestServer,
22
+ type WebhookIngestRequest,
23
+ } from './webhook-ingest-server.js'
24
+ import { forwardToGateway } from '../../src/web/webhook-ingest-client.js'
25
+
26
+ const servers: WebhookIngestServer[] = []
27
+ afterEach(() => {
28
+ for (const s of servers.splice(0)) s.close()
29
+ })
30
+
31
+ function tmpSocket(): string {
32
+ const dir = mkdtempSync(join(tmpdir(), 'webhook-ingest-srv-'))
33
+ return join(dir, 'webhook.sock')
34
+ }
35
+
36
+ function sampleReq(): WebhookIngestRequest {
37
+ return {
38
+ agent: 'reggie',
39
+ source: 'github',
40
+ event_type: 'pull_request',
41
+ ts: 1_700_000_000_000,
42
+ rendered_text: 'PR opened',
43
+ payload: { action: 'opened', number: 7 },
44
+ delivery_id: 'd-7',
45
+ }
46
+ }
47
+
48
+ describe('startWebhookIngestServer (peercred-gated)', () => {
49
+ it('accepts a connection from an allowed uid and round-trips onRecord', async () => {
50
+ const socketPath = tmpSocket()
51
+ let received: WebhookIngestRequest | null = null
52
+ const server = startWebhookIngestServer({
53
+ socketPath,
54
+ allowedUids: [process.getuid!()], // our own uid → allowed
55
+ onRecord: (req) => {
56
+ received = req
57
+ return { status: 'ok', ts: req.ts, dispatched: 1 }
58
+ },
59
+ log: () => {},
60
+ })
61
+ servers.push(server)
62
+
63
+ const resp = await forwardToGateway(socketPath, sampleReq())
64
+
65
+ expect(resp).not.toBeNull()
66
+ expect(resp!.status).toBe('ok')
67
+ expect(resp!.ts).toBe(1_700_000_000_000)
68
+ expect(resp!.dispatched).toBe(1)
69
+ expect(received).not.toBeNull()
70
+ expect(received!.agent).toBe('reggie')
71
+ expect(received!.event_type).toBe('pull_request')
72
+ expect(received!.delivery_id).toBe('d-7')
73
+ })
74
+
75
+ it('denies a connection whose uid is not in allowedUids (onRecord never runs)', async () => {
76
+ const socketPath = tmpSocket()
77
+ let called = false
78
+ const server = startWebhookIngestServer({
79
+ socketPath,
80
+ allowedUids: [999_999], // definitely not our uid
81
+ onRecord: () => {
82
+ called = true
83
+ return { status: 'ok' }
84
+ },
85
+ log: () => {},
86
+ })
87
+ servers.push(server)
88
+
89
+ // The server destroys the connection in its accept callback BEFORE
90
+ // reading any bytes, so onRecord must never run. We assert the
91
+ // security property (no service for an unlisted uid) directly rather
92
+ // than via the client resolving: bun's net client does not observe a
93
+ // server-side destroy of a just-accepted UDS connection promptly, so
94
+ // awaiting forwardToGateway here would hang. Send raw bytes and give
95
+ // the server ample time to (not) process them.
96
+ const raw = net.createConnection(socketPath)
97
+ raw.on('connect', () => raw.write(JSON.stringify(sampleReq()) + '\n'))
98
+ raw.on('error', () => {})
99
+ await new Promise((r) => setTimeout(r, 300))
100
+ raw.destroy()
101
+
102
+ expect(called).toBe(false)
103
+ })
104
+
105
+ it('returns null when the socket does not exist (gateway down)', async () => {
106
+ const socketPath = tmpSocket() // never bound
107
+ const resp = await forwardToGateway(socketPath, sampleReq(), { timeoutMs: 2000 })
108
+ expect(resp).toBeNull()
109
+ })
110
+
111
+ it('surfaces a gateway-side error status from onRecord', async () => {
112
+ const socketPath = tmpSocket()
113
+ const server = startWebhookIngestServer({
114
+ socketPath,
115
+ allowedUids: [process.getuid!()],
116
+ onRecord: () => ({ status: 'error', error: 'write failed' }),
117
+ log: () => {},
118
+ })
119
+ servers.push(server)
120
+
121
+ const resp = await forwardToGateway(socketPath, sampleReq())
122
+ expect(resp).not.toBeNull()
123
+ expect(resp!.status).toBe('error')
124
+ })
125
+ })
@@ -0,0 +1,218 @@
1
+ /**
2
+ * Webhook ingest UDS server (RFC docs/rfcs/webhook-via-gateway-socket.md).
3
+ *
4
+ * A dedicated, peercred-gated Unix socket the host-side web receiver
5
+ * forwards verified webhook events to. It is deliberately SEPARATE from
6
+ * the bridge IPC socket (`gateway.sock`):
7
+ *
8
+ * - `gateway.sock` is bound via `Bun.listen`, whose accepted socket does
9
+ * NOT expose a file descriptor — so SO_PEERCRED can't be read on it.
10
+ * This server uses `node:net` (whose `Socket._handle.fd` IS readable
11
+ * under Bun, verified empirically) precisely so the peer-credential
12
+ * gate works.
13
+ * - Keeping the chat-critical bridge socket untouched avoids any risk to
14
+ * the bridge-flap-sensitive reconnect path.
15
+ *
16
+ * Auth model — two independent layers:
17
+ * 1. **Filesystem**: the socket is chmod 0o666 so the host operator UID
18
+ * can connect at all (the agent dir itself is 0775/agent-UID, which
19
+ * blocks the operator from writing files but not from connecting a
20
+ * world-connectable socket).
21
+ * 2. **Peer credentials**: every connection's SO_PEERCRED UID must be in
22
+ * `allowedUids` (the agent's own UID + the operator/receiver UID).
23
+ * This is the load-bearing gate: it ensures only the trusted receiver
24
+ * (which enforces per-event HMAC) — or the agent itself — can inject a
25
+ * webhook turn. Fail-closed: unreadable creds or an unlisted UID is
26
+ * denied.
27
+ *
28
+ * Wire protocol: one JSON line in (`WebhookIngestRequest`), one JSON line
29
+ * out (`WebhookIngestResponse`), then the connection closes. No framing
30
+ * beyond the trailing newline; requests are small (a rendered event +
31
+ * payload).
32
+ */
33
+
34
+ import net from 'node:net'
35
+ import { chmodSync, existsSync, unlinkSync } from 'node:fs'
36
+ import { getPeerCred } from '../../src/vault/broker/peercred-ffi.js'
37
+
38
+ /** Forwarded by the receiver; structural mirror of WebhookGatewayRecord. */
39
+ export interface WebhookIngestRequest {
40
+ agent: string
41
+ source: string
42
+ event_type: string
43
+ ts: number
44
+ rendered_text: string
45
+ payload: Record<string, unknown>
46
+ delivery_id?: string
47
+ }
48
+
49
+ export interface WebhookIngestResponse {
50
+ status: 'ok' | 'deduped' | 'error'
51
+ ts?: number
52
+ error?: string
53
+ dispatched?: number
54
+ }
55
+
56
+ export interface WebhookIngestServerOptions {
57
+ socketPath: string
58
+ /** SO_PEERCRED UIDs permitted to inject. Connections from any other UID
59
+ * (or with unreadable creds) are denied and the socket destroyed. */
60
+ allowedUids: number[]
61
+ /** Handle one verified, forwarded event. May return the response directly
62
+ * (the record path is file I/O + an in-process inject) or as a Promise; the
63
+ * server resolves either form before replying. */
64
+ onRecord: (req: WebhookIngestRequest) => WebhookIngestResponse | Promise<WebhookIngestResponse>
65
+ log?: (line: string) => void
66
+ }
67
+
68
+ export interface WebhookIngestServer {
69
+ close: () => void
70
+ }
71
+
72
+ const MAX_REQUEST_BYTES = 1024 * 1024 // 1 MiB — github payloads fit easily
73
+
74
+ /** Read the accepted connection's fd via the undocumented `_handle.fd`.
75
+ * Under Bun's node:net polyfill this is present (verified); returns null
76
+ * if absent so the caller fails closed. */
77
+ function fdOf(conn: net.Socket): number | null {
78
+ const handle = (conn as unknown as { _handle?: { fd?: number } })._handle
79
+ if (!handle || typeof handle.fd !== 'number' || handle.fd < 0) return null
80
+ return handle.fd
81
+ }
82
+
83
+ /**
84
+ * Start the webhook ingest server. Never throws — bind failures are only
85
+ * logged, and the returned handle's `close()` stays safe to call regardless.
86
+ * The CALLER (gateway boot) additionally wraps this so a failure here can
87
+ * never take the agent down; this function's own try/catch is belt-and-
88
+ * suspenders.
89
+ */
90
+ export function startWebhookIngestServer(
91
+ opts: WebhookIngestServerOptions,
92
+ ): WebhookIngestServer {
93
+ const log = opts.log ?? ((s) => process.stderr.write(s))
94
+ const allowed = new Set(opts.allowedUids)
95
+
96
+ // Clear a stale socket from a previous (crashed) gateway. Safe: only this
97
+ // agent's gateway binds this path, and we're about to rebind it.
98
+ try {
99
+ if (existsSync(opts.socketPath)) unlinkSync(opts.socketPath)
100
+ } catch (err) {
101
+ log(`webhook-ingest-server: could not unlink stale socket: ${(err as Error).message}\n`)
102
+ }
103
+
104
+ const server = net.createServer((conn) => {
105
+ // ── Peer-credential gate (fail-closed) ──────────────────────────────────
106
+ const fd = fdOf(conn)
107
+ const cred = fd !== null ? getPeerCred(fd) : null
108
+ if (cred === null || !allowed.has(cred.uid)) {
109
+ log(
110
+ `webhook-ingest-server: DENY connection uid=${cred?.uid ?? 'unknown'} ` +
111
+ `(allowed=${[...allowed].join(',')})\n`,
112
+ )
113
+ conn.destroy()
114
+ return
115
+ }
116
+
117
+ let buf = ''
118
+ let handled = false
119
+ conn.setEncoding('utf8')
120
+
121
+ const reply = (resp: WebhookIngestResponse) => {
122
+ if (handled) return
123
+ handled = true
124
+ try {
125
+ conn.write(JSON.stringify(resp) + '\n')
126
+ } catch {
127
+ /* peer may have hung up */
128
+ }
129
+ conn.end()
130
+ }
131
+
132
+ conn.on('data', (chunk: string) => {
133
+ if (handled) return
134
+ buf += chunk
135
+ if (buf.length > MAX_REQUEST_BYTES) {
136
+ reply({ status: 'error', error: 'request too large' })
137
+ return
138
+ }
139
+ const nl = buf.indexOf('\n')
140
+ if (nl === -1) return // wait for the full line
141
+ const line = buf.slice(0, nl)
142
+
143
+ let req: WebhookIngestRequest
144
+ try {
145
+ req = JSON.parse(line) as WebhookIngestRequest
146
+ } catch {
147
+ reply({ status: 'error', error: 'malformed request' })
148
+ return
149
+ }
150
+ if (
151
+ !req ||
152
+ typeof req.agent !== 'string' ||
153
+ typeof req.source !== 'string' ||
154
+ typeof req.event_type !== 'string' ||
155
+ typeof req.rendered_text !== 'string' ||
156
+ typeof req.payload !== 'object' ||
157
+ req.payload === null
158
+ ) {
159
+ reply({ status: 'error', error: 'invalid request shape' })
160
+ return
161
+ }
162
+
163
+ Promise.resolve()
164
+ .then(() => opts.onRecord(req))
165
+ .then((resp) => reply(resp))
166
+ .catch((err: unknown) => {
167
+ log(`webhook-ingest-server: onRecord threw: ${String(err)}\n`)
168
+ reply({ status: 'error', error: 'internal error' })
169
+ })
170
+ })
171
+
172
+ conn.on('error', (err) => {
173
+ log(`webhook-ingest-server: conn error: ${err.message}\n`)
174
+ })
175
+
176
+ // Drop slow/idle clients so a stuck connection can't pin the socket.
177
+ conn.setTimeout(10_000, () => {
178
+ if (!handled) reply({ status: 'error', error: 'timeout' })
179
+ conn.destroy()
180
+ })
181
+ })
182
+
183
+ server.on('error', (err) => {
184
+ log(`webhook-ingest-server: server error: ${err.message}\n`)
185
+ })
186
+
187
+ try {
188
+ server.listen(opts.socketPath, () => {
189
+ try {
190
+ // World-connectable at the FS layer; peercred is the real gate.
191
+ chmodSync(opts.socketPath, 0o666)
192
+ } catch (err) {
193
+ log(`webhook-ingest-server: chmod failed: ${(err as Error).message}\n`)
194
+ }
195
+ log(
196
+ `webhook-ingest-server: listening at ${opts.socketPath} ` +
197
+ `(allowed uids: ${[...allowed].join(',')})\n`,
198
+ )
199
+ })
200
+ } catch (err) {
201
+ log(`webhook-ingest-server: listen failed: ${(err as Error).message}\n`)
202
+ }
203
+
204
+ return {
205
+ close: () => {
206
+ try {
207
+ server.close()
208
+ } catch {
209
+ /* ignore */
210
+ }
211
+ try {
212
+ if (existsSync(opts.socketPath)) unlinkSync(opts.socketPath)
213
+ } catch {
214
+ /* ignore */
215
+ }
216
+ },
217
+ }
218
+ }
@@ -19,7 +19,6 @@
19
19
  import { existsSync, mkdirSync, appendFileSync } from 'node:fs'
20
20
  import { dirname, join } from 'node:path'
21
21
  import { captureEvent } from './analytics-posthog.js'
22
- import type { PokeLevel } from './silence-poke.js'
23
22
 
24
23
  export type RuntimeMetricEvent =
25
24
  /**
@@ -63,40 +62,14 @@ export type RuntimeMetricEvent =
63
62
  ended_via: 'reply' | 'stream_reply_done' | 'silent' | 'forced' | 'framework_fallback'
64
63
  }
65
64
  /**
66
- * Framework safety-net: a silence-poke was armed. `ack` is the early
67
- * (~10s) ack-budget poke the model has sent NOTHING this turn and is
68
- * leaving the user on a silent chat. `soft` (75s) / `firm` (180s) are
69
- * the silence-since-last-outbound ladder. The system-reminder appended
70
- * to the next tool result nudges the model to send an update. Doubles
71
- * as a design-health signal — if these fire frequently, the
72
- * conversational-pacing prompt isn't doing its job.
73
- */
74
- | {
75
- kind: 'silence_poke_fired'
76
- key: string
77
- level: PokeLevel
78
- silence_ms: number
79
- subagent_wait: boolean
80
- }
81
- /**
82
- * The model sent an outbound message within the success window
83
- * (default 15s) after a poke fired. Pair with `silence_poke_fired`
84
- * to compute success rate — the design target is >80%. (`ack`-level
85
- * success is not currently emitted — the ack poke sits outside the
86
- * `pokesFired` ladder noteOutbound measures against; the type admits
87
- * `ack` only so the silence-poke metric union stays assignable.)
88
- */
89
- | {
90
- kind: 'silence_poke_succeeded'
91
- key: string
92
- level: PokeLevel
93
- latency_ms: number
94
- }
95
- /**
96
- * Last-resort: 5 minutes silent, the framework itself sent a
97
- * user-visible "still working… / still thinking…" message. Should
98
- * be rare (target <5 per 1000 turns); a high rate means the model
99
- * is genuinely stuck or the soft/firm pokes aren't being honoured.
65
+ * Last-resort safety net: 5 minutes silent, the framework itself sent
66
+ * a user-visible "still working… / still thinking…" message AND
67
+ * unwedged the turn (cleared activeTurnStartedAt, nulled currentTurn,
68
+ * drained buffered inbound). Should be rare (target <5 per 1000 turns);
69
+ * a high rate means turns are genuinely getting stuck. This is the only
70
+ * remaining framework safety-net signal — the model-targeted nudge
71
+ * ladder (ack/soft/firm) and the 60s awareness ping were retired once
72
+ * the live-updating reply/draft took over the pacing job.
100
73
  */
101
74
  | {
102
75
  kind: 'silence_fallback_sent'
@@ -104,23 +77,6 @@ export type RuntimeMetricEvent =
104
77
  fallback_kind: 'working' | 'thinking'
105
78
  silence_ms: number
106
79
  }
107
- /**
108
- * Awareness ping (~60s, default): framework-owned user-visible
109
- * "still working… / still thinking…" message sent BEFORE the 300s
110
- * fallback so the user never faces a silent chat for the full 5
111
- * minutes. Silent (no device ping); one-shot per turn; suppressed
112
- * by any outbound or sub-agent dispatch. A high rate is the
113
- * diagnostic signal that frequent silences exist (held-inbound,
114
- * extended-thinking, slow startup), and the rate of the heavier
115
- * silence_fallback_sent that still follows tells us how many of
116
- * those escalate all the way to 5 min.
117
- */
118
- | {
119
- kind: 'awareness_ping_sent'
120
- key: string
121
- fallback_kind: 'working' | 'thinking'
122
- silence_ms: number
123
- }
124
80
  /**
125
81
  * #1445 cross-turn pending-async ambient lifecycle. `started` fires
126
82
  * when a turn ends with a captured anchor AND a pending Agent/Task/