polygram 0.8.0 → 0.9.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/lib/{agent-loader.js → agents/loader.js} +6 -8
  3. package/lib/{approvals.js → approvals/store.js} +28 -5
  4. package/lib/{approval-ui.js → approvals/ui.js} +1 -17
  5. package/lib/config.js +121 -0
  6. package/lib/{error-classify.js → error/classify.js} +25 -34
  7. package/lib/handlers/abort.js +89 -0
  8. package/lib/handlers/approvals.js +361 -0
  9. package/lib/handlers/autosteer.js +94 -0
  10. package/lib/handlers/config-callback.js +118 -0
  11. package/lib/handlers/config-ui.js +104 -0
  12. package/lib/handlers/dispatcher.js +263 -0
  13. package/lib/handlers/download.js +182 -0
  14. package/lib/handlers/extract-attachments.js +97 -0
  15. package/lib/handlers/ipc-send.js +80 -0
  16. package/lib/handlers/poll.js +140 -0
  17. package/lib/handlers/record-inbound.js +88 -0
  18. package/lib/handlers/slash-commands.js +319 -0
  19. package/lib/handlers/voice.js +107 -0
  20. package/lib/pm-interface.js +27 -29
  21. package/lib/sdk/build-options.js +177 -0
  22. package/lib/sdk/callbacks.js +213 -0
  23. package/lib/{process-manager-sdk.js → sdk/process-manager.js} +19 -31
  24. package/lib/{telegram.js → telegram/api.js} +2 -2
  25. package/lib/{telegram-prompt.js → telegram/display-hint.js} +0 -14
  26. package/lib/{stream-reply.js → telegram/streamer.js} +4 -4
  27. package/package.json +2 -3
  28. package/polygram.js +347 -2581
  29. package/scripts/doctor.js +1 -1
  30. package/scripts/ipc-smoke.js +1 -10
  31. package/bin/approval-hook.js +0 -113
  32. package/lib/approval-waiters.js +0 -201
  33. package/lib/pm-router.js +0 -201
  34. package/lib/process-manager.js +0 -806
  35. package/lib/{auto-resume.js → db/auto-resume.js} +0 -0
  36. package/lib/{inbox.js → db/inbox.js} +0 -0
  37. package/lib/{pairings.js → db/pairings.js} +0 -0
  38. package/lib/{replay-window.js → db/replay-window.js} +0 -0
  39. package/lib/{sent-cache.js → db/sent-cache.js} +0 -0
  40. package/lib/{sessions.js → db/sessions.js} +0 -0
  41. package/lib/{net-errors.js → error/net.js} +0 -0
  42. package/lib/{ipc-client.js → ipc/client.js} +0 -0
  43. package/lib/{ipc-file-validator.js → ipc/file-validator.js} +0 -0
  44. package/lib/{ipc-server.js → ipc/server.js} +0 -0
  45. package/lib/{telegram-chunk.js → telegram/chunk.js} +0 -0
  46. package/lib/{deliver.js → telegram/deliver.js} +0 -0
  47. package/lib/{telegram-format.js → telegram/format.js} +0 -0
  48. package/lib/{parse-response.js → telegram/parse.js} +0 -0
  49. package/lib/{status-reactions.js → telegram/reactions.js} +0 -0
  50. package/lib/{typing-indicator.js → telegram/typing.js} +0 -0
  51. package/lib/{voice.js → telegram/voice.js} +0 -0
@@ -1,806 +0,0 @@
1
- /**
2
- * LRU-bounded warm process pool with FIFO pending queue per process.
3
- *
4
- * Each `entry` owns ONE claude subprocess. Messages sent via `send()` are
5
- * appended to `entry.pendingQueue` and their prompt is written to the
6
- * subprocess stdin. Claude processes stdin in FIFO order and emits one
7
- * `result` event per turn. Each result resolves the oldest pending
8
- * (queue head).
9
- *
10
- * Timers (idle + wall-clock) are only armed for the HEAD of the queue —
11
- * the turn Claude is currently working on. When the head is shifted,
12
- * the next pending becomes head and its timers arm fresh. This avoids
13
- * the footgun of "pending #2's timer started ticking when its stdin
14
- * was written, but Claude spent 5 minutes on pending #1 first → #2
15
- * times out before Claude sees it".
16
- *
17
- * Timer fire rejects ONLY that pending (policy: don't kill the whole
18
- * subprocess, other in-flight work is probably fine). If the subprocess
19
- * is truly stuck, its head pending will time out repeatedly.
20
- *
21
- * The `onStreamChunk` and `onToolUse` callbacks pass the live `entry` so
22
- * callers can inspect `entry.pendingQueue[0]` to route output to the
23
- * correct turn's streamer / reactor / source message.
24
- *
25
- * All I/O (spawn, db) is injected for testability.
26
- */
27
-
28
- const { createInterface } = require('readline');
29
- const { isTransientHttpError } = require('./error-classify');
30
-
31
- const DEFAULT_CAP = 10;
32
- const DEFAULT_KILL_TIMEOUT_MS = 3000;
33
- // 0.7.7: transient HTTP retry. When Anthropic returns a 5xx (or 429
34
- // rate-limit) and the turn produced ZERO assistant messages so far,
35
- // pm sleeps and retries the user message ONCE before surfacing the
36
- // error to the user. Matches OpenClaw's
37
- // pi-embedded-Vt2x_Jl3.js:39210-39216 — "single retry, then surface".
38
- // Idempotency-protected: we only retry if no assistant content has
39
- // streamed (otherwise re-sending would replay tools that already ran).
40
- const DEFAULT_TRANSIENT_RETRY_DELAY_MS = 2500;
41
- const MAX_TRANSIENT_RETRIES = 1;
42
- // 0.7.6 (item H): hard cap on per-session pending queue depth.
43
- // Pre-fix, a chat with rapid-fire user messages (or a stuck Claude that
44
- // stops emitting `result`) could grow pendingQueue unbounded — each
45
- // pending holds a streamer + reactor + timers, so a runaway client
46
- // could exhaust memory or burn API quota for ack reactions on every
47
- // dropped message. 50 is generous (a normal turn never queues more
48
- // than a handful) but safely bounded.
49
- const DEFAULT_QUEUE_CAP = 50;
50
-
/**
 * Extract the user-visible text of a stream-json `assistant` event.
 *
 * Only `text` content blocks contribute; `tool_use` (and any other)
 * blocks are ignored. Multiple text blocks are joined with a blank
 * line. A single trailing colon is normalised to an ellipsis (see the
 * file header for the rationale).
 *
 * @param {object} event - stream-json assistant event (may be malformed)
 * @returns {string} joined, trimmed, colon-normalised text ('' if none)
 */
function extractAssistantText(event) {
  const content = event?.message?.content;
  if (!Array.isArray(content)) return '';
  const texts = content
    .filter((block) => block && block.type === 'text' && typeof block.text === 'string')
    .map((block) => block.text);
  return texts.join('\n\n').trim().replace(/([^:]):\s*$/, '$1…');
}
67
-
// 0.7.6 (item F): sum the four canonical usage counters across a Map of
// per-message usage objects. Each map value is the LAST-SEEN usage for
// that message id (Anthropic emits cumulative totals within a message);
// summing across map values gives the turn-wide totals.
//
// Defensive against missing fields — older claude versions may not
// always emit cache_*_input_tokens. Non-finite / non-numeric values
// are skipped rather than summed.
function sumUsage(usageByMessage) {
  const totals = {
    input_tokens: 0,
    output_tokens: 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
  };
  const fields = Object.keys(totals);
  for (const usage of usageByMessage.values()) {
    if (!usage) continue;
    for (const field of fields) {
      const value = usage[field];
      if (Number.isFinite(value)) totals[field] += value;
    }
  }
  return totals;
}
95
-
96
- /**
97
- * Stream-json CLI-backed ProcessManager. Implements the canonical
98
- * Pm interface (`lib/pm-interface.js`). Optional methods exposed:
99
- * `requestRespawn` — drain queue and respawn process on next send
100
- * (kept for parity with rc.6+ feature-detection at the router; SDK
101
- * pm uses `applyFlagSettings` + `setModel` for the same UX).
102
- *
103
- * Optional methods NOT implemented (SDK pm has these): `steer`,
104
- * `setModel`, `applyFlagSettings`, `setPermissionMode`,
105
- * `drainQueue`, `interrupt`, `resetSession`.
106
- *
107
- * @implements {import('./pm-interface.js').Pm}
108
- */
109
- class ProcessManager {
110
- constructor({
111
- cap = DEFAULT_CAP,
112
- queueCap = DEFAULT_QUEUE_CAP,
113
- spawnFn,
114
- db = null,
115
- logger = console,
116
- killTimeoutMs = DEFAULT_KILL_TIMEOUT_MS,
117
- onInit = null, // (sessionKey, event, entry) → void
118
- onResult = null, // (sessionKey, event, entry, pending) → void
119
- onClose = null, // (sessionKey, code, entry) → void
120
- onStreamChunk = null, // (sessionKey, partialText, entry) → void — routes to pendingQueue[0]
121
- onToolUse = null, // (sessionKey, toolName, entry) → void — routes to pendingQueue[0]
122
- onAssistantMessageStart = null, // (sessionKey, entry) → void — fires when a NEW top-level assistant message begins (after a previous one ended). Used by polygram.js to call streamer.forceNewMessage() so each assistant message gets its own bubble.
123
- onRespawn = null, // (sessionKey, reason, entry) → void — fires after graceful drain-and-kill
124
- onQueueDrop = null, // 0.7.6: (sessionKey, droppedPending, entry) → void — fired when a pending is dropped because pendingQueue exceeded queueCap. Polygram uses this to surface a warning on the dropped message.
125
- } = {}) {
126
- if (!spawnFn) throw new Error('spawnFn required');
127
- this.cap = cap;
128
- this.queueCap = queueCap;
129
- this.spawnFn = spawnFn;
130
- this.db = db;
131
- this.logger = logger;
132
- this.killTimeoutMs = killTimeoutMs;
133
- this.onInit = onInit;
134
- this.onResult = onResult;
135
- this.onClose = onClose;
136
- this.onStreamChunk = onStreamChunk;
137
- this.onToolUse = onToolUse;
138
- this.onAssistantMessageStart = onAssistantMessageStart;
139
- this.onRespawn = onRespawn;
140
- this.onQueueDrop = onQueueDrop;
141
- this.procs = new Map();
142
- }
143
-
144
- has(sessionKey) {
145
- return this.procs.has(sessionKey);
146
- }
147
-
148
- get(sessionKey) {
149
- return this.procs.get(sessionKey);
150
- }
151
-
152
- size() {
153
- return this.procs.size;
154
- }
155
-
156
- keys() {
157
- return Array.from(this.procs.keys());
158
- }
159
-
160
- async getOrSpawn(sessionKey, spawnContext) {
161
- const existing = this.procs.get(sessionKey);
162
- if (existing && !existing.closed) {
163
- existing.lastUsedTs = Date.now();
164
- return existing;
165
- }
166
- if (this.procs.size >= this.cap) {
167
- const evicted = await this.evictLRU();
168
- if (!evicted) {
169
- // All sessions are in-flight — wait for one to drain, then retry.
170
- // Waiters are held in `this._lruWaiters` FIFO and signalled when any
171
- // pending queue empties (see _maybeSignalLruWaiter).
172
- await this._awaitLruSlot();
173
- // After waking, try the whole path again — the evictLRU may now
174
- // succeed, or an existing session may have been spawned for this key.
175
- return this.getOrSpawn(sessionKey, spawnContext);
176
- }
177
- }
178
- return this._spawn(sessionKey, spawnContext);
179
- }
180
-
181
- // Hold a promise pair per waiter. _maybeSignalLruWaiter shifts the oldest
182
- // waiter when a slot might have freed up. Each waiter has its own timer
183
- // that rejects with 'LRU wait timeout' if no slot appears in time.
184
- _awaitLruSlot({ timeoutMs = 5 * 60_000 } = {}) {
185
- if (!this._lruWaiters) this._lruWaiters = [];
186
- return new Promise((resolve, reject) => {
187
- const waiter = { resolve, reject };
188
- const timer = setTimeout(() => {
189
- const idx = this._lruWaiters.indexOf(waiter);
190
- if (idx !== -1) this._lruWaiters.splice(idx, 1);
191
- this._logEvent('lru-wait-timeout', { cap: this.cap, queued_waiters: this._lruWaiters.length });
192
- reject(new Error(`LRU wait timeout after ${timeoutMs / 1000}s`));
193
- }, timeoutMs);
194
- waiter.timer = timer;
195
- this._lruWaiters.push(waiter);
196
- this._logEvent('lru-wait', { cap: this.cap, queued_waiters: this._lruWaiters.length });
197
- });
198
- }
199
-
200
- _maybeSignalLruWaiter() {
201
- if (!this._lruWaiters || this._lruWaiters.length === 0) return;
202
- // Only signal if there's actually capacity now (a session went idle
203
- // or closed). Otherwise keep waiters sleeping for the next chance.
204
- let hasIdle = false;
205
- for (const v of this.procs.values()) {
206
- if (!v.inFlight) { hasIdle = true; break; }
207
- }
208
- if (!hasIdle && this.procs.size >= this.cap) return;
209
- const w = this._lruWaiters.shift();
210
- clearTimeout(w.timer);
211
- w.resolve();
212
- }
213
-
214
- async evictLRU() {
215
- let victim = null;
216
- for (const [k, v] of this.procs) {
217
- if (v.inFlight) continue;
218
- if (!victim || v.lastUsedTs < victim.entry.lastUsedTs) {
219
- victim = { key: k, entry: v };
220
- }
221
- }
222
- if (!victim) {
223
- this._logEvent('lru-full', { cap: this.cap });
224
- return false;
225
- }
226
- this._logEvent('evict', { session_key: victim.key, chat_id: victim.entry.chatId });
227
- await this.kill(victim.key);
228
- return true;
229
- }
230
-
231
- /**
232
- * Request a graceful respawn (e.g. because /model or /effort changed).
233
- * If the queue is empty, kill now; otherwise mark the entry so it kills
234
- * itself when the last pending resolves. Next send() respawns fresh
235
- * with whatever config spawnFn reads at that moment.
236
- *
237
- * onRespawn fires with `wasDrained=true` ONLY when we waited for an
238
- * in-flight turn to finish before swapping. The immediate-kill case
239
- * (queue empty at request time) calls onRespawn with `wasDrained=false`
240
- * so callers can decide whether to post a user-visible confirmation
241
- * (which is redundant noise when the user wasn't waiting on a turn).
242
- */
243
- requestRespawn(sessionKey, reason = 'config-change') {
244
- const entry = this.procs.get(sessionKey);
245
- if (!entry || entry.closed) return { killed: false, queued: 0 };
246
- entry.needsRespawn = reason;
247
- this._logEvent('respawn-requested', {
248
- session_key: sessionKey,
249
- chat_id: entry.chatId,
250
- reason,
251
- queued: entry.pendingQueue.length,
252
- });
253
- if (entry.pendingQueue.length === 0) {
254
- // Queue empty — kill immediately, fire onRespawn after close.
255
- this._killAndNotifyRespawn(sessionKey, reason, false).catch(() => {});
256
- return { killed: true, queued: 0 };
257
- }
258
- return { killed: false, queued: entry.pendingQueue.length };
259
- }
260
-
261
- async _killAndNotifyRespawn(sessionKey, reason, wasDrained) {
262
- const entry = this.procs.get(sessionKey);
263
- await this.kill(sessionKey);
264
- if (this.onRespawn && entry) {
265
- try { this.onRespawn(sessionKey, reason, entry, wasDrained); }
266
- catch (err) { this.logger.error(`[pm] onRespawn: ${err.message}`); }
267
- }
268
- }
269
-
270
- async kill(sessionKey) {
271
- const entry = this.procs.get(sessionKey);
272
- if (!entry) return;
273
- this.procs.delete(sessionKey);
274
- try { entry.proc.kill('SIGTERM'); } catch {}
275
- await new Promise((resolve) => {
276
- if (entry.closed) return resolve();
277
- const timer = setTimeout(() => {
278
- try { entry.proc.kill('SIGKILL'); } catch {}
279
- resolve();
280
- }, this.killTimeoutMs);
281
- entry.proc.once('close', () => { clearTimeout(timer); resolve(); });
282
- });
283
- // Reject all pendings in the queue (if any survived the 'close' handler).
284
- while (entry.pendingQueue.length > 0) {
285
- const p = entry.pendingQueue.shift();
286
- p.clearTimers?.();
287
- p.reject(new Error('Process killed'));
288
- }
289
- }
290
-
291
- async killChat(chatId) {
292
- const prefix = String(chatId);
293
- const targets = [];
294
- for (const key of this.procs.keys()) {
295
- if (key === prefix || key.startsWith(prefix + ':')) targets.push(key);
296
- }
297
- for (const key of targets) await this.kill(key);
298
- }
299
-
300
- async shutdown() {
301
- // rc.38: mark "we're shutting down" so the proc.on('close') handler
302
- // suppresses the misleading `resume-fail` event for signal-driven
303
- // exits (SIGHUP from tmux pty close, SIGTERM from our own kill,
304
- // SIGKILL from the kill-timeout escalator). Pre-rc.38 every deploy
305
- // logged a `resume-fail` for every CLI-pm chat AND cleared the
306
- // saved session_id, forcing a fresh resume on the next user turn
307
- // — slower first turn, fresh context — for no real reason.
308
- this._shuttingDown = true;
309
- const keys = Array.from(this.procs.keys());
310
- for (const key of keys) await this.kill(key);
311
- }
312
-
313
- _spawn(sessionKey, ctx = {}) {
314
- const proc = this.spawnFn(sessionKey, ctx);
315
- const rl = createInterface({ input: proc.stdout });
316
- const entry = {
317
- sessionKey,
318
- proc,
319
- rl,
320
- pendingQueue: [],
321
- lastUsedTs: Date.now(),
322
- inFlight: false,
323
- closed: false,
324
- needsRespawn: null,
325
- sessionId: ctx.existingSessionId || null,
326
- chatId: ctx.chatId || null,
327
- threadId: ctx.threadId || null,
328
- label: ctx.label || sessionKey,
329
- };
330
-
331
- rl.on('line', (line) => {
332
- let event;
333
- try { event = JSON.parse(line); }
334
- catch { this.logger.error(`[${entry.label}] non-JSON: ${line.slice(0, 200)}`); return; }
335
-
336
- // Fix A: ANY stream-json event counts as Claude activity. Reset the
337
- // idle timer on the HEAD pending (the turn Claude is working on),
338
- // regardless of event type. Subagent runs emit `user`-type
339
- // tool_result events between the parent's assistant events — those
340
- // previously did NOT reset the timer, causing false timeouts during
341
- // long subagent work.
342
- const head = entry.pendingQueue[0];
343
- if (head) head.resetIdleTimer?.();
344
-
345
- if (event.type === 'system' && event.subtype === 'init') {
346
- entry.sessionId = event.session_id;
347
- if (this.onInit) this.onInit(sessionKey, event, entry);
348
- }
349
-
350
- if (event.type === 'assistant' && head) {
351
- // 0.7.0 (Phase F): detect message_id transitions to split bubbles
352
- // per top-level assistant message. Each Anthropic stream-json
353
- // 'assistant' event carries event.message.id; the same id across
354
- // events means cumulative updates to the same message, a new
355
- // id means a new message (typically after a tool-result cycle).
356
- const messageId = event.message?.id;
357
- const added = extractAssistantText(event);
358
- // 0.7.4 (item B): first sign Claude is doing real work on this
359
- // pending. Fire onFirstStream ONCE, regardless of whether the
360
- // assistant message has text or only tool_use blocks (some turns
361
- // emit tool_use first with no preamble).
362
- const hasAssistantContent = !!added || (Array.isArray(event.message?.content)
363
- && event.message.content.some((b) => b?.type === 'tool_use'));
364
- if (hasAssistantContent) {
365
- head.fireFirstStream?.();
366
- // 0.7.7: any assistant content (text OR tool_use) disqualifies
367
- // the turn from transient-retry — re-sending the user prompt
368
- // after this point would replay tools that already executed.
369
- head.firstAssistantSeen = true;
370
- }
371
- // 0.7.6 (item F): accumulate usage + counters for turn telemetry.
372
- // The `result` event carries total_cost_usd + duration_ms but NOT
373
- // a usage breakdown; usage lives on each assistant.message.usage.
374
- // Anthropic emits cumulative totals per assistant message id
375
- // (so within a single message the last usage seen wins; across
376
- // distinct messages they sum).
377
- const usage = event.message?.usage;
378
- if (usage) {
379
- if (messageId != null && head.lastUsageMessageId === messageId) {
380
- // same message, replace running totals for this message
381
- head.usageByMessage.set(messageId, usage);
382
- } else {
383
- head.lastUsageMessageId = messageId;
384
- head.usageByMessage.set(messageId, usage);
385
- }
386
- }
387
- if (Array.isArray(event.message?.content)) {
388
- for (const b of event.message.content) {
389
- if (b?.type === 'tool_use') head.toolUseCount++;
390
- }
391
- }
392
- if (added) {
393
- // Pre-0.7.0 we did `streamText = streamText + '\n\n' + added`,
394
- // which DUPLICATED text on every update because `added` is
395
- // the cumulative full text-so-far of the current assistant
396
- // message (not a delta). 0.7.0 REPLACES instead — the new
397
- // text is already cumulative — and uses messageId boundaries
398
- // to fire onAssistantMessageStart for each new top-level
399
- // assistant message. The streamer responds by force-creating
400
- // a fresh bubble, so each assistant message gets its own.
401
- const isNewMessage = head.lastAssistantMessageId != null
402
- && messageId != null
403
- && head.lastAssistantMessageId !== messageId
404
- && head.streamText
405
- && head.streamText.length > 0;
406
- if (isNewMessage && this.onAssistantMessageStart) {
407
- try { this.onAssistantMessageStart(sessionKey, entry); }
408
- catch (err) { this.logger.error(`[${entry.label}] onAssistantMessageStart: ${err.message}`); }
409
- }
410
- if (messageId != null) head.lastAssistantMessageId = messageId;
411
- head.streamText = added;
412
- if (this.onStreamChunk) {
413
- try { this.onStreamChunk(sessionKey, head.streamText, entry); }
414
- catch (err) { this.logger.error(`[${entry.label}] onStreamChunk: ${err.message}`); }
415
- }
416
- }
417
- if (this.onToolUse) {
418
- const blocks = event.message?.content;
419
- if (Array.isArray(blocks)) {
420
- for (const b of blocks) {
421
- if (b?.type === 'tool_use' && b.name) {
422
- try { this.onToolUse(sessionKey, b.name, entry); }
423
- catch (err) { this.logger.error(`[${entry.label}] onToolUse: ${err.message}`); }
424
- }
425
- }
426
- }
427
- }
428
- }
429
-
430
- if (event.type === 'result' && head) {
431
- // 0.7.7: transient HTTP retry. If Anthropic returned a
432
- // retryable error AND the turn produced ZERO assistant
433
- // content yet AND we haven't already retried, sleep and
434
- // re-write the prompt instead of resolving the pending.
435
- // Idempotency: firstAssistantSeen guards against replaying
436
- // tools that already ran.
437
- const errSignal = event.error || event.subtype;
438
- const isError = event.subtype !== 'success';
439
- const shouldTransientRetry = isError
440
- && !head.firstAssistantSeen
441
- && head.transientRetries < MAX_TRANSIENT_RETRIES
442
- && head.prompt != null
443
- && isTransientHttpError({ message: errSignal, subtype: event.subtype });
444
- if (shouldTransientRetry) {
445
- head.transientRetries++;
446
- this._logEvent('transient-retry', {
447
- session_key: sessionKey,
448
- chat_id: entry.chatId,
449
- attempt: head.transientRetries,
450
- subtype: event.subtype,
451
- error: typeof errSignal === 'string' ? errSignal.slice(0, 200) : null,
452
- });
453
- // Reset accumulators so the retried turn's metrics aren't
454
- // contaminated by the failed-turn's totals (usage on a
455
- // failed turn IS billed but we surface it as a separate
456
- // event-log entry rather than mixing into turn_metrics).
457
- head.usageByMessage = new Map();
458
- head.lastUsageMessageId = null;
459
- head.toolUseCount = 0;
460
- head.streamText = '';
461
- head.lastAssistantMessageId = null;
462
- // Re-arm idle timer (the old one is still ticking from the
463
- // previous activate; resetIdleTimer just re-arms).
464
- head.resetIdleTimer?.();
465
- // Sleep then re-write. Keep the pending in-place; the next
466
- // 'result' event resolves it normally (or hits the same
467
- // retry path if MAX_TRANSIENT_RETRIES hadn't been
468
- // exhausted, which after the increment above it has).
469
- setTimeout(() => {
470
- // Edge case: pending was killed/aborted during the
471
- // retry sleep — process exited, queue drained, etc.
472
- // Skip the re-write if pendingQueue no longer holds us.
473
- if (entry.pendingQueue[0] !== head || entry.closed) return;
474
- try {
475
- entry.proc.stdin.write(JSON.stringify({
476
- type: 'user',
477
- message: { role: 'user', content: head.prompt },
478
- }) + '\n');
479
- } catch (err) {
480
- // stdin write failed — fall back to surfacing the
481
- // error. Mark as not-retried-anymore so we don't loop.
482
- this.logger.error(`[${entry.label}] transient-retry stdin write failed: ${err.message}`);
483
- entry.pendingQueue.shift();
484
- head.clearTimers();
485
- head.reject(err);
486
- }
487
- }, DEFAULT_TRANSIENT_RETRY_DELAY_MS);
488
- return; // don't shift / resolve; wait for next result
489
- }
490
-
491
- entry.pendingQueue.shift();
492
- head.clearTimers();
493
- if (this.onResult) this.onResult(sessionKey, event, entry, head);
494
- // 0.7.6 (item F): sum usage across distinct assistant messages
495
- // (each message id seen got its last-known usage stored; sum the
496
- // map values). Yields a single-row metric summary the caller
497
- // can persist via db.insertTurnMetric().
498
- const usageTotals = sumUsage(head.usageByMessage);
499
- head.resolve({
500
- text: event.result || '',
501
- sessionId: event.session_id,
502
- cost: event.total_cost_usd,
503
- duration: event.duration_ms,
504
- error: event.subtype === 'success' ? null : (event.error || event.subtype),
505
- metrics: {
506
- inputTokens: usageTotals.input_tokens,
507
- outputTokens: usageTotals.output_tokens,
508
- cacheCreationTokens: usageTotals.cache_creation_input_tokens,
509
- cacheReadTokens: usageTotals.cache_read_input_tokens,
510
- numAssistantMessages: head.usageByMessage.size,
511
- numToolUses: head.toolUseCount,
512
- resultSubtype: event.subtype || null,
513
- },
514
- });
515
- // Activate next head or settle idle state.
516
- if (entry.pendingQueue.length > 0) {
517
- entry.pendingQueue[0].activate();
518
- } else {
519
- entry.inFlight = false;
520
- // An entry just went idle → an LRU waiter might be able to run now.
521
- this._maybeSignalLruWaiter();
522
- // Graceful drain-and-respawn: if caller asked for a respawn
523
- // (e.g. /model change) and we just emptied the queue, kill now
524
- // and fire onRespawn so the caller can post confirmation.
525
- if (entry.needsRespawn) {
526
- const reason = entry.needsRespawn;
527
- entry.needsRespawn = null;
528
- this._logEvent('respawn-draining', {
529
- session_key: sessionKey,
530
- chat_id: entry.chatId,
531
- reason,
532
- });
533
- // wasDrained=true: this path runs after the queue emptied
534
- // naturally (an in-flight turn finished), so the user was
535
- // waiting and the confirmation message is meaningful.
536
- this._killAndNotifyRespawn(sessionKey, reason, true).catch(() => {});
537
- }
538
- }
539
- }
540
- });
541
-
542
- proc.on('close', (code) => {
543
- entry.closed = true;
544
- entry.inFlight = false;
545
- while (entry.pendingQueue.length > 0) {
546
- const p = entry.pendingQueue.shift();
547
- p.clearTimers?.();
548
- p.reject(new Error(`Process exited (code ${code})`));
549
- }
550
- this.procs.delete(sessionKey);
551
- // A slot freed up → maybe an LRU waiter can run now.
552
- this._maybeSignalLruWaiter();
553
- // rc.38: only fire `resume-fail` for UNEXPECTED non-zero exits.
554
- // Signal-driven exits during planned shutdown (SIGHUP from tmux
555
- // pty close on `tmux kill-session`, SIGTERM from our own kill(),
556
- // SIGKILL from the kill-timeout escalator) are NOT resume
557
- // failures — the saved session_id is still valid, we'd just be
558
- // clearing it for nothing and logging misleading noise on every
559
- // deploy. The real signal we care about is "the CLI rejected a
560
- // stale or corrupt resume id at startup with a non-zero exit
561
- // while polygram is healthy."
562
- const isPlannedShutdown = this._shuttingDown
563
- || code === null // killed without an exit code
564
- || code === 129 // SIGHUP (tmux pty close on deploy kickstart)
565
- || code === 143 // SIGTERM (our own kill())
566
- || code === 137; // SIGKILL (kill-timeout escalation)
567
- if (code !== 0 && ctx.existingSessionId && this.db?.clearSessionId
568
- && !isPlannedShutdown) {
569
- this._logEvent('resume-fail', { session_key: sessionKey, session_id: ctx.existingSessionId, code });
570
- try { this.db.clearSessionId(sessionKey); } catch (err) {
571
- this.logger.error(`[${entry.label}] clearSessionId failed: ${err.message}`);
572
- }
573
- }
574
- if (this.onClose) this.onClose(sessionKey, code, entry);
575
- });
576
-
577
- // rc.38: stdin error listener. Async EIO writes (the kernel reports
578
- // them after the subprocess pipe closed during shutdown) had no
579
- // listener pre-rc.38 → bubbled to the global uncaughtException
580
- // handler → emitted misleading `uncaught-exception: write EIO`
581
- // events on every deploy. Listening swallows that path; runtime
582
- // stdin errors (rare; usually a real problem) still log here.
583
- proc.stdin?.on?.('error', (err) => {
584
- this.logger.error(`[${entry.label}] stdin error: ${err.message}`);
585
- });
586
-
587
- proc.on('error', (err) => {
588
- this.logger.error(`[${entry.label}] proc error: ${err.message}`);
589
- entry.closed = true;
590
- entry.inFlight = false;
591
- while (entry.pendingQueue.length > 0) {
592
- const p = entry.pendingQueue.shift();
593
- p.clearTimers?.();
594
- p.reject(err);
595
- }
596
- this.procs.delete(sessionKey);
597
- });
598
-
599
- this.procs.set(sessionKey, entry);
600
- return entry;
601
- }
602
-
603
- /**
604
- * Append a turn to the queue. The returned promise resolves when Claude
605
- * emits a `result` event for this turn (they emerge in stdin-write
606
- * order). The underlying stdin write happens synchronously inside this
607
- * call — the caller should have already serialised writes across
608
- * sessions via an external lock if order matters.
609
- *
610
- * Options:
611
- * timeoutMs — idle timer between Claude events (default 10min)
612
- * maxTurnMs — wall-clock ceiling from "activate" time (default 30min)
613
- * context — opaque object stored on the pending (polygram puts
614
- * streamer, reactor, sourceMsgId here for its own use)
615
- */
616
- send(sessionKey, prompt, {
617
- timeoutMs = 600_000,
618
- maxTurnMs = 30 * 60_000,
619
- context = {},
620
- } = {}) {
621
- return new Promise((resolve, reject) => {
622
- const entry = this.procs.get(sessionKey);
623
- if (!entry || entry.closed) return reject(new Error('No process for session'));
624
- if (!entry.proc.stdin || entry.proc.stdin.destroyed || !entry.proc.stdin.writable) {
625
- return reject(new Error('Process stdin not writable'));
626
- }
627
- // If this entry is awaiting respawn, refuse new sends — the caller
628
- // should wait for the respawn to complete (which happens when the
629
- // current queue drains).
630
- if (entry.needsRespawn) {
631
- return reject(new Error(`Session awaiting respawn (${entry.needsRespawn})`));
632
- }
633
-
634
- entry.lastUsedTs = Date.now();
635
-
636
- let idleTimer = null;
637
- let maxTimer = null;
638
- let activated = false;
639
-
640
- const clearTimers = () => {
641
- if (idleTimer) { clearTimeout(idleTimer); idleTimer = null; }
642
- if (maxTimer) { clearTimeout(maxTimer); maxTimer = null; }
643
- };
644
-
645
- const pending = {
646
- resolve: (r) => { clearTimers(); resolve(r); },
647
- reject: (e) => { clearTimers(); reject(e); },
648
- clearTimers,
649
- startedAt: null,
650
- streamText: '',
651
- context,
652
- idleTimer: null,
653
- maxTimer: null,
654
- activated: false,
655
- // 0.7.6 (item F): per-turn telemetry accumulators. usageByMessage
656
- // collects each assistant message's last-seen usage; we sum
657
- // across messages at result time (each id is summed once, not
658
- // per stream chunk, since usage in stream-json is cumulative
659
- // *within* a message — last-seen-per-message wins).
660
- usageByMessage: new Map(),
661
- lastUsageMessageId: null,
662
- toolUseCount: 0,
663
- // 0.7.4 (item B): set true when the first stream event (assistant
664
- // text or tool_use) arrives for this pending. Fires
665
- // `context.onFirstStream` once. Used by polygram to flip the
666
- // status reaction QUEUED → THINKING when Claude actually starts
667
- // producing output, not when the pending becomes queue head
668
- // (which can be ~hundreds of ms before the first token).
669
- firstStreamFired: false,
670
- // 0.7.7: transient-retry support. We hold the prompt so we can
671
- // re-write it on transient 5xx/429 if zero assistant content
672
- // streamed yet. firstAssistantSeen flips on first assistant
673
- // event with non-empty content OR tool_use blocks — once true,
674
- // retry is no longer idempotent (we'd replay executed tools)
675
- // and pm surfaces the error instead.
676
- prompt,
677
- transientRetries: 0,
678
- firstAssistantSeen: false,
679
- };
680
-
681
- pending.fireFirstStream = () => {
682
- if (pending.firstStreamFired) return;
683
- pending.firstStreamFired = true;
684
- try { context?.onFirstStream?.(); }
685
- catch (err) { this.logger.error(`[${entry.label}] onFirstStream: ${err.message}`); }
686
- };
687
-
688
- const fireTimeout = (reason) => {
689
- // Only act if we're still the head; if we've been shifted/killed
690
- // already, this is a stale callback.
691
- if (entry.pendingQueue[0] !== pending) return;
692
- this._logEvent('turn-timeout', {
693
- session_key: sessionKey,
694
- chat_id: entry.chatId,
695
- reason,
696
- });
697
- // Remove from queue, reject. Per Q1 policy: don't kill the
698
- // subprocess — later pendings might still be fine.
699
- entry.pendingQueue.shift();
700
- pending.reject(new Error(reason));
701
- // Activate next head if any, else idle.
702
- if (entry.pendingQueue.length > 0) {
703
- entry.pendingQueue[0].activate();
704
- } else {
705
- entry.inFlight = false;
706
- }
707
- };
708
-
709
- const armIdle = () => setTimeout(
710
- () => fireTimeout(`Timeout: ${timeoutMs / 1000}s idle with no Claude activity`),
711
- timeoutMs,
712
- );
713
-
714
- pending.activate = () => {
715
- if (activated) return;
716
- activated = true;
717
- pending.activated = true;
718
- pending.startedAt = Date.now();
719
- idleTimer = armIdle();
720
- pending.idleTimer = idleTimer;
721
- maxTimer = setTimeout(
722
- () => fireTimeout(`Turn exceeded ${maxTurnMs / 1000}s wall-clock ceiling`),
723
- maxTurnMs,
724
- );
725
- pending.maxTimer = maxTimer;
726
- // Give callers a hook so they can transition user-visible state
727
- // (e.g. status reaction "👀 queued" → "🤔 thinking") the moment
728
- // Claude actually starts this pending, not the moment it arrived.
729
- try { context?.onActivate?.(); }
730
- catch (err) { this.logger.error(`[${entry.label}] onActivate: ${err.message}`); }
731
- };
732
-
733
- pending.resetIdleTimer = () => {
734
- if (!activated) return;
735
- if (idleTimer) clearTimeout(idleTimer);
736
- idleTimer = armIdle();
737
- pending.idleTimer = idleTimer;
738
- };
739
-
740
- // 0.7.6 (item H): enforce per-session queue cap. Drop the OLDEST
741
- // non-active pending (index 1 — index 0 is the in-flight head and
742
- // killing it mid-turn would corrupt Claude's state). The dropped
743
- // pending's promise rejects so its handler (polygram.js) can
744
- // surface a "couldn't keep up — message dropped" warning to the
745
- // user. We drop AFTER pushing the new pending so the cap means
746
- // "at most queueCap pendings live", not "refuse to enqueue past N".
747
- // Refusing the new write would lose the most recent message —
748
- // usually the one the user actually cares about — whereas
749
- // dropping the oldest preserves recency at the cost of a stale
750
- // queued turn that the user has likely moved past anyway.
751
- entry.pendingQueue.push(pending);
752
- entry.inFlight = true;
753
- while (entry.pendingQueue.length > this.queueCap) {
754
- // Splice at index 1 to leave the active head intact.
755
- const dropped = entry.pendingQueue.splice(1, 1)[0];
756
- if (!dropped) break;
757
- dropped.clearTimers?.();
758
- const dropErr = new Error(
759
- `queue overflow: dropped (queue cap ${this.queueCap})`,
760
- );
761
- dropErr.code = 'QUEUE_OVERFLOW';
762
- this._logEvent('queue-overflow-drop', {
763
- session_key: sessionKey,
764
- chat_id: entry.chatId,
765
- queue_len: entry.pendingQueue.length,
766
- source_msg_id: dropped.context?.sourceMsgId ?? null,
767
- });
768
- if (this.onQueueDrop) {
769
- try { this.onQueueDrop(sessionKey, dropped, entry); }
770
- catch (err) { this.logger.error(`[${entry.label}] onQueueDrop: ${err.message}`); }
771
- }
772
- dropped.reject(dropErr);
773
- }
774
-
775
- // If we're the only pending, activate immediately. Otherwise wait
776
- // until the preceding pending is shifted out.
777
- if (entry.pendingQueue.length === 1) pending.activate();
778
-
779
- try {
780
- entry.proc.stdin.write(JSON.stringify({
781
- type: 'user',
782
- message: { role: 'user', content: prompt },
783
- }) + '\n');
784
- } catch (err) {
785
- const idx = entry.pendingQueue.indexOf(pending);
786
- if (idx !== -1) entry.pendingQueue.splice(idx, 1);
787
- if (entry.pendingQueue.length === 0) entry.inFlight = false;
788
- pending.reject(err);
789
- }
790
- });
791
- }
792
-
793
- _logEvent(kind, detail) {
794
- if (!this.db?.logEvent) return;
795
- try { this.db.logEvent(kind, detail); }
796
- catch (err) { this.logger.error(`[pm] logEvent ${kind} failed: ${err.message}`); }
797
- }
798
- }
799
-
800
- module.exports = {
801
- ProcessManager,
802
- DEFAULT_CAP,
803
- DEFAULT_QUEUE_CAP,
804
- extractAssistantText,
805
- sumUsage,
806
- };