polygram 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/error/classify.js +47 -6
- package/lib/process/cli-process.js +122 -14
- package/lib/process-manager.js +13 -0
- package/lib/sdk/callbacks.js +35 -0
- package/package.json +1 -1
- package/polygram.js +14 -8
package/lib/error/classify.js
CHANGED
|
@@ -214,14 +214,25 @@ const CODES = {
|
|
|
214
214
|
isTransient: false,
|
|
215
215
|
autoRecover: null,
|
|
216
216
|
},
|
|
217
|
-
// TURN_TIMEOUT: per-turn time cap
|
|
218
|
-
//
|
|
219
|
-
//
|
|
220
|
-
//
|
|
221
|
-
//
|
|
217
|
+
// TURN_TIMEOUT: per-turn time cap fired because the turn went QUIET with no
|
|
218
|
+
// detectable progress (0.16: the busy-aware checkpoint extends a turn that's
|
|
219
|
+
// provably working, so reaching this code means the probe saw no streaming /
|
|
220
|
+
// no active shell — a genuine stall/wedge, not a long-but-working turn). Not
|
|
221
|
+
// transient. Copy must not name a number (the 2026-06-11 UMI false-⏱ rendered
|
|
222
|
+
// "10-minute" under a 60-minute cap).
|
|
222
223
|
TURN_TIMEOUT: {
|
|
223
224
|
kind: 'turnTimeout',
|
|
224
|
-
userMessage: '⏱ This one
|
|
225
|
+
userMessage: '⏱ This one went quiet with no progress, so I stopped waiting — send /stop to clear it, or resend if you still need it.',
|
|
226
|
+
isTransient: false,
|
|
227
|
+
autoRecover: null,
|
|
228
|
+
},
|
|
229
|
+
// TURN_MAX_EXCEEDED (0.16): the busy-aware checkpoint kept extending a turn
|
|
230
|
+
// that WAS still working, until it hit the hard wall-clock backstop
|
|
231
|
+
// (turnHardMaxMs, default 90 min). Distinct from TURN_TIMEOUT (which means
|
|
232
|
+
// "went quiet") — here it ran genuinely long and we capped it for safety.
|
|
233
|
+
TURN_MAX_EXCEEDED: {
|
|
234
|
+
kind: 'turnMaxExceeded',
|
|
235
|
+
userMessage: '⏱ This ran past the max time and I had to stop it. Resend if you still need it — or break it into smaller steps.',
|
|
225
236
|
isTransient: false,
|
|
226
237
|
autoRecover: null,
|
|
227
238
|
},
|
|
@@ -416,8 +427,38 @@ function detectWedgedSessionError(text) {
|
|
|
416
427
|
return cls;
|
|
417
428
|
}
|
|
418
429
|
|
|
430
|
+
/**
 * 0.16: decide how the streamed-reply catch (polygram.js handleMessage) should
 * cap the bubble + set the reactor when a turn ends in error. Extracted as a
 * pure fn so the decision is unit-testable (the catch itself isn't unit-reachable).
 *
 * A turn-end error counts as a timeout when EITHER signal is present:
 *   1. err.code is 'TURN_TIMEOUT' (went quiet) or 'TURN_MAX_EXCEEDED' (hit the
 *      hard cap). The cli backend sets these; its err.message is
 *      `turn timeout (...)`, which does NOT match the legacy regex, so the code
 *      check is required for that backend.
 *   2. The message matches the legacy /wall-clock ceiling|idle.../ regex. The
 *      SDK + tmux backends reject with a MESSAGE and NO code (e.g. "Turn
 *      exceeded 1800s wall-clock ceiling" / "Timeout: 600s idle with no Claude
 *      activity"), so the regex MUST stay as a fallback — without it those
 *      backends' timeouts flip from the calm ⏱ TIMEOUT reactor to the scary
 *      ERROR one (regression caught in the 0.16 code review).
 *
 * Both kinds of timeout are real stops, so the "stream interrupted" suffix is
 * honest for them as well as for generic errors.
 *
 * @param {*} err - the rejection value. Usually an Error; a bare string
 *   rejection is treated as the message text. null/undefined are tolerated.
 * @returns {{ errorSuffix: string, reactorState: 'TIMEOUT' | 'ERROR' }}
 *   - errorSuffix: appended to streamer.finalize('') — currently always
 *     'stream interrupted' (the caller treats a falsy value as "no suffix").
 *   - reactorState: the value to pass to reactor.setState(...).
 */
function classifyTurnEndError(err) {
  const code = err?.code;
  // A rejection that is itself a string carries its text directly; everything
  // else is expected to expose it on `.message`.
  const text = typeof err === 'string' ? err : err?.message;
  const isTimeout = code === 'TURN_TIMEOUT'
    || code === 'TURN_MAX_EXCEEDED'
    || /wall-clock ceiling|idle with no Claude activity/i.test(text || '');
  return { errorSuffix: 'stream interrupted', reactorState: isTimeout ? 'TIMEOUT' : 'ERROR' };
}
|
|
458
|
+
|
|
419
459
|
module.exports = {
|
|
420
460
|
classify,
|
|
461
|
+
classifyTurnEndError,
|
|
421
462
|
detectWedgedSessionError,
|
|
422
463
|
isTransientHttpError,
|
|
423
464
|
PATTERNS,
|
package/lib/process/cli-process.js
CHANGED
|
@@ -97,7 +97,8 @@ const INPUT_LEDGER_CAP = 64;
|
|
|
97
97
|
// (Envelope shape verified from prod JSONL + the P0 spike — Q1.)
|
|
98
98
|
const UPS_ENVELOPE_TURN_ID_RE = /<channel\s[^>]*turn_id="([0-9a-f-]{36})"/g;
|
|
99
99
|
const DEFAULT_TURN_TIMEOUT_MS = 600_000; // 10 min idle cap (resets on each reply — Review F#13)
|
|
100
|
-
const DEFAULT_TURN_ABSOLUTE_MS = 1_800_000; // 30 min
|
|
100
|
+
const DEFAULT_TURN_ABSOLUTE_MS = 1_800_000; // 30 min busy-aware checkpoint interval (0.16: re-arms while working)
|
|
101
|
+
const DEFAULT_TURN_HARD_MAX_MS = 5_400_000; // 90 min hard wall-clock backstop (0.16: extension can't exceed this)
|
|
101
102
|
const DEFAULT_INTERRUPT_GRACE_MS = 5_000; // after Ctrl-C, wait this long for Claude to ack before synthesizing 'interrupted'
|
|
102
103
|
const DEFAULT_MAX_REPLIES_PER_TURN = 20; // P1 #12: cap on quiet-window resets to prevent chatty-Claude hang
|
|
103
104
|
const PING_INTERVAL_MS = 10_000;
|
|
@@ -223,6 +224,7 @@ class CliProcess extends Process {
|
|
|
223
224
|
deliveryWatchdogMs = DEFAULT_DELIVERY_WATCHDOG_MS,
|
|
224
225
|
turnTimeoutMs = DEFAULT_TURN_TIMEOUT_MS,
|
|
225
226
|
turnAbsoluteMs = DEFAULT_TURN_ABSOLUTE_MS,
|
|
227
|
+
turnHardMaxMs = DEFAULT_TURN_HARD_MAX_MS,
|
|
226
228
|
bgWorkStallMs = DEFAULT_BG_WORK_STALL_MS,
|
|
227
229
|
interruptGraceMs = DEFAULT_INTERRUPT_GRACE_MS,
|
|
228
230
|
maxRepliesPerTurn = DEFAULT_MAX_REPLIES_PER_TURN,
|
|
@@ -258,6 +260,7 @@ class CliProcess extends Process {
|
|
|
258
260
|
this.deliveryWatchdogMs = deliveryWatchdogMs;
|
|
259
261
|
this.turnTimeoutMs = turnTimeoutMs;
|
|
260
262
|
this.turnAbsoluteMs = turnAbsoluteMs;
|
|
263
|
+
this.turnHardMaxMs = turnHardMaxMs;
|
|
261
264
|
this.bgWorkStallMs = bgWorkStallMs;
|
|
262
265
|
this.interruptGraceMs = interruptGraceMs;
|
|
263
266
|
this.maxRepliesPerTurn = maxRepliesPerTurn;
|
|
@@ -2083,7 +2086,12 @@ class CliProcess extends Process {
|
|
|
2083
2086
|
// Added absoluteTimer as the true wall-clock ceiling at 30 min so a
|
|
2084
2087
|
// legitimate 15-min "replies every 60s" turn isn't killed mid-stream
|
|
2085
2088
|
// while still bounding runaways.
|
|
2086
|
-
|
|
2089
|
+
// 0.16: `reason` ∈ {'idle','absolute','hard-max'}. The absolute checkpoint
|
|
2090
|
+
// (_checkpointAbsolute) passes its already-captured `probeResult` so we
|
|
2091
|
+
// don't double capture-pane on the give-up path. err.code is mapped from
|
|
2092
|
+
// reason: 'hard-max' → TURN_MAX_EXCEEDED (ran long while working), else
|
|
2093
|
+
// → TURN_TIMEOUT (went quiet / idle).
|
|
2094
|
+
const fireTimeout = (reason, probeResult = null) => {
|
|
2087
2095
|
if (!this.pendingTurns.has(turnId)) return;
|
|
2088
2096
|
const pending = this.pendingTurns.get(turnId);
|
|
2089
2097
|
// 0.13 D1 (S9): unblock any open ask FIRST — claude must never stay
|
|
@@ -2105,7 +2113,11 @@ class CliProcess extends Process {
|
|
|
2105
2113
|
if (pending._activityQuietTimer) clearTimeout(pending._activityQuietTimer);
|
|
2106
2114
|
if (pending._onStop) this.off('stop-hook', pending._onStop);
|
|
2107
2115
|
this.inFlight = this.pendingTurns.size > 0;
|
|
2108
|
-
const turnTimeoutMs = reason === '
|
|
2116
|
+
const turnTimeoutMs = reason === 'hard-max'
|
|
2117
|
+
? (pending._turnHardMaxMs || this.turnHardMaxMs)
|
|
2118
|
+
: reason === 'absolute'
|
|
2119
|
+
? this.turnAbsoluteMs
|
|
2120
|
+
: (opts.maxTurnMs || this.turnTimeoutMs);
|
|
2109
2121
|
|
|
2110
2122
|
// 0.13 D1 ceiling-resolve: a ceiling expiring on a turn with delivered
|
|
2111
2123
|
// replies RESOLVES it — the user already has their answer; rejecting
|
|
@@ -2150,10 +2162,10 @@ class CliProcess extends Process {
|
|
|
2150
2162
|
// 0.12.3 wedge characterization (docs/0.13-turn-wedge-autorecovery-spec.md):
|
|
2151
2163
|
// a zero-reply turn hit the ceiling = claude wedged (no hooks AND no
|
|
2152
2164
|
// "esc to interrupt" the whole window). Capture the TUI pane tail + busy
|
|
2153
|
-
// flags to learn WHAT state claude is stuck in
|
|
2154
|
-
//
|
|
2155
|
-
//
|
|
2156
|
-
|
|
2165
|
+
// flags to learn WHAT state claude is stuck in. 0.16: reuse the probe the
|
|
2166
|
+
// absolute checkpoint already captured (probeResult) to avoid a second
|
|
2167
|
+
// capture-pane; only probe fresh on the idle-timer path (no prior probe).
|
|
2168
|
+
const logProbe = (probe) => {
|
|
2157
2169
|
this._logEvent('turn-timeout-pane', {
|
|
2158
2170
|
reason,
|
|
2159
2171
|
streaming: probe.streaming,
|
|
@@ -2162,10 +2174,12 @@ class CliProcess extends Process {
|
|
|
2162
2174
|
captured: probe.captured,
|
|
2163
2175
|
pane_tail: probe.paneTail,
|
|
2164
2176
|
});
|
|
2165
|
-
}
|
|
2177
|
+
};
|
|
2178
|
+
if (probeResult) { try { logProbe(probeResult); } catch { /* best-effort */ } }
|
|
2179
|
+
else this.probeBusyState().then(logProbe).catch(() => { /* telemetry best-effort */ });
|
|
2166
2180
|
this.emit('idle');
|
|
2167
2181
|
const err = new Error(`turn timeout (${turnTimeoutMs}ms, reason=${reason})`);
|
|
2168
|
-
err.code = 'TURN_TIMEOUT';
|
|
2182
|
+
err.code = reason === 'hard-max' ? 'TURN_MAX_EXCEEDED' : 'TURN_TIMEOUT';
|
|
2169
2183
|
reject(err);
|
|
2170
2184
|
};
|
|
2171
2185
|
const pending = {
|
|
@@ -2179,15 +2193,24 @@ class CliProcess extends Process {
|
|
|
2179
2193
|
// hardTimer = idle ceiling. Resets on any activity (_noteActivity)
|
|
2180
2194
|
// so a chatty or tool-heavy turn isn't killed at 10 min wall-clock.
|
|
2181
2195
|
hardTimer: setTimeout(() => fireTimeout('idle'), opts.maxTurnMs || this.turnTimeoutMs),
|
|
2182
|
-
// absoluteTimer =
|
|
2183
|
-
//
|
|
2184
|
-
//
|
|
2185
|
-
//
|
|
2186
|
-
|
|
2196
|
+
// absoluteTimer = busy-aware checkpoint (0.16). Fires every
|
|
2197
|
+
// turnAbsoluteMs (30min) as a LIVENESS CHECK: if the turn is provably
|
|
2198
|
+
// working (streaming/active shell + progress since last checkpoint) and
|
|
2199
|
+
// under the hard backstop, re-arm; else give up. Replaces the old
|
|
2200
|
+
// one-shot 30-min guillotine that cut actively-working turns.
|
|
2201
|
+
absoluteTimer: setTimeout(() => this._checkpointAbsolute(turnId), this.turnAbsoluteMs),
|
|
2187
2202
|
// Review F#13: attach fireTimeout so activity can reset the idle
|
|
2188
2203
|
// timer (creates a fresh setTimeout with the same closure).
|
|
2189
2204
|
_fireTimeout: fireTimeout,
|
|
2190
2205
|
startedAt: Date.now(),
|
|
2206
|
+
// 0.16: hard wall-clock backstop for this turn (per-send override →
|
|
2207
|
+
// instance default). The checkpoint never extends past this.
|
|
2208
|
+
_turnHardMaxMs: opts.maxTurnHardMs || this.turnHardMaxMs,
|
|
2209
|
+
// 0.16: checkpoint progress-tracking (MF-A) — extend only if activity
|
|
2210
|
+
// advanced since the previous checkpoint, not just "a shell exists".
|
|
2211
|
+
_lastCheckpointActivityAt: Date.now(),
|
|
2212
|
+
_lastCheckpointPaneTail: null,
|
|
2213
|
+
_extended: false,
|
|
2191
2214
|
};
|
|
2192
2215
|
this.pendingTurns.set(turnId, pending);
|
|
2193
2216
|
|
|
@@ -2332,6 +2355,81 @@ class CliProcess extends Process {
|
|
|
2332
2355
|
this._interruptGraceTimer.unref?.();
|
|
2333
2356
|
}
|
|
2334
2357
|
|
|
2358
|
+
/**
|
|
2359
|
+
* 0.16 busy-aware ceiling checkpoint. Armed by the per-turn absoluteTimer
|
|
2360
|
+
* every `turnAbsoluteMs` (30min). Decides whether to EXTEND a still-working
|
|
2361
|
+
* turn or give up:
|
|
2362
|
+
*
|
|
2363
|
+
* - replied turn → resolve gracefully (delegate to fireTimeout, which takes
|
|
2364
|
+
* the line-2118 ceiling-resolve branch).
|
|
2365
|
+
* - probe says working AND progress advanced since last checkpoint AND
|
|
2366
|
+
* elapsed < hard backstop → re-arm (turn stays pending, /stop keeps
|
|
2367
|
+
* working, the live reply lands in the same bubble). Ping once.
|
|
2368
|
+
* - not working / no progress → give up as 'idle' → TURN_TIMEOUT (went quiet).
|
|
2369
|
+
* - elapsed ≥ hard backstop → give up as 'hard-max' → TURN_MAX_EXCEEDED.
|
|
2370
|
+
*
|
|
2371
|
+
* MF-A: "working" requires evidence of PROGRESS (streaming now, or activity /
|
|
2372
|
+
* pane changed since the last checkpoint), not merely a shell's existence — a
|
|
2373
|
+
* hung/zombie background shell would otherwise extend to the hard max.
|
|
2374
|
+
* MF-C: re-check pendingTurns AFTER the async probe (the turn can resolve /
|
|
2375
|
+
* abort / kill during the capture-pane round-trip — TOCTOU), and reassign
|
|
2376
|
+
* pending.absoluteTimer so teardown sites clear the live handle.
|
|
2377
|
+
*/
|
|
2378
|
+
async _checkpointAbsolute(turnId) {
|
|
2379
|
+
if (!this.pendingTurns.has(turnId)) return;
|
|
2380
|
+
let pending = this.pendingTurns.get(turnId);
|
|
2381
|
+
// Replied turn (or consumed-acked): the ceiling RESOLVES it, never extends.
|
|
2382
|
+
if ((pending.replies?.length || 0) > 0
|
|
2383
|
+
|| (pending.seen === true && pending._consumedAcked === true)) {
|
|
2384
|
+
pending._fireTimeout('absolute');
|
|
2385
|
+
return;
|
|
2386
|
+
}
|
|
2387
|
+
let probe = null;
|
|
2388
|
+
try { probe = await this.probeBusyState(); } catch { probe = null; }
|
|
2389
|
+
// MF-C TOCTOU: the turn may have settled during the capture-pane await.
|
|
2390
|
+
if (!this.pendingTurns.has(turnId)) return;
|
|
2391
|
+
pending = this.pendingTurns.get(turnId);
|
|
2392
|
+
// Also bail if the turn entered finalization DURING the probe — a reply
|
|
2393
|
+
// landed, or it's in stop-grace, or it consumed-acked. Re-arming or pinging
|
|
2394
|
+
// now would resurrect a settling turn (spurious "still working" right as the
|
|
2395
|
+
// real answer lands). It will finalize through its own quiet/grace path.
|
|
2396
|
+
if (pending._stopGracePending
|
|
2397
|
+
|| (pending.replies?.length || 0) > 0
|
|
2398
|
+
|| (pending.seen === true && pending._consumedAcked === true)) return;
|
|
2399
|
+
const now = Date.now();
|
|
2400
|
+
const elapsed = now - pending.startedAt;
|
|
2401
|
+
const maxMs = pending._turnHardMaxMs || this.turnHardMaxMs;
|
|
2402
|
+
const streaming = !!(probe && probe.streaming);
|
|
2403
|
+
const hasShell = !!(probe && (probe.backgroundShell || probe.shellCount > 0));
|
|
2404
|
+
const lastAct = Math.max(this._lastActivityAt || 0, this._lastHookEventAt || 0);
|
|
2405
|
+
// MF-A progress delta: streaming NOW is live proof; otherwise require that
|
|
2406
|
+
// activity advanced OR the pane changed since the previous checkpoint.
|
|
2407
|
+
const progressed = streaming
|
|
2408
|
+
|| (lastAct > (pending._lastCheckpointActivityAt || pending.startedAt))
|
|
2409
|
+
|| (!!probe && probe.paneTail != null && probe.paneTail !== pending._lastCheckpointPaneTail);
|
|
2410
|
+
const working = (streaming || hasShell) && progressed;
|
|
2411
|
+
|
|
2412
|
+
if (working && elapsed < maxMs) {
|
|
2413
|
+
pending._lastCheckpointActivityAt = lastAct || pending._lastCheckpointActivityAt;
|
|
2414
|
+
pending._lastCheckpointPaneTail = (probe && probe.paneTail) || pending._lastCheckpointPaneTail;
|
|
2415
|
+
// MF-C: reassign the live handle so cleanup sites clear THIS timer.
|
|
2416
|
+
pending.absoluteTimer = setTimeout(() => this._checkpointAbsolute(turnId), this.turnAbsoluteMs);
|
|
2417
|
+
this._logEvent('turn-extended', {
|
|
2418
|
+
turn_id: turnId, elapsed_ms: elapsed, streaming, shell_count: probe ? probe.shellCount : 0,
|
|
2419
|
+
});
|
|
2420
|
+
// Progress ping ONCE per turn (first extension) — emits an event polygram
|
|
2421
|
+
// turns into a single "still working" message (honest: probe-confirmed).
|
|
2422
|
+
if (!pending._extended) {
|
|
2423
|
+
pending._extended = true;
|
|
2424
|
+
this.emit('turn-extended', { sessionKey: this.sessionKey, turnId, elapsedMs: elapsed });
|
|
2425
|
+
}
|
|
2426
|
+
return;
|
|
2427
|
+
}
|
|
2428
|
+
// Give up: hard-max (was working but ran too long) vs idle (went quiet).
|
|
2429
|
+
const reason = elapsed >= maxMs ? 'hard-max' : 'idle';
|
|
2430
|
+
pending._fireTimeout(reason, probe);
|
|
2431
|
+
}
|
|
2432
|
+
|
|
2335
2433
|
/**
|
|
2336
2434
|
* Is claude actually still working, regardless of the resolved-turn flag?
|
|
2337
2435
|
*
|
|
@@ -2425,6 +2523,16 @@ class CliProcess extends Process {
|
|
|
2425
2523
|
return this._bgWorkSince !== null;
|
|
2426
2524
|
}
|
|
2427
2525
|
|
|
2526
|
+
/**
|
|
2527
|
+
* 0.16 (MF-B): does any in-flight turn have a busy-aware ceiling EXTENSION
|
|
2528
|
+
* active? Such a turn can hold its slot up to the hard backstop, so the LRU
|
|
2529
|
+
* treats it as a durable pin (soft-overflow) rather than a transient turn.
|
|
2530
|
+
*/
|
|
2531
|
+
hasExtendedTurn() {
|
|
2532
|
+
for (const p of this.pendingTurns.values()) if (p._extended) return true;
|
|
2533
|
+
return false;
|
|
2534
|
+
}
|
|
2535
|
+
|
|
2428
2536
|
/**
|
|
2429
2537
|
* Resolve the model / effort for a spawn context using the topic→chat→
|
|
2430
2538
|
* fallback precedence (mirrors the spawn path). Single source of truth shared
|
package/lib/process-manager.js
CHANGED
|
@@ -53,6 +53,12 @@ const CALLBACK_TO_EVENT = {
|
|
|
53
53
|
// posts/edits a "⏳ working in background" status message so a long job reads as
|
|
54
54
|
// working, not stuck. See docs/0.12.0-background-work-lifecycle-plan.md.
|
|
55
55
|
onBgWorkStatus: 'bg-work-status',
|
|
56
|
+
// 0.16 busy-aware ceiling: CliProcess emits 'turn-extended' the FIRST time a
|
|
57
|
+
// turn passes the 30-min checkpoint while still provably working. The callback
|
|
58
|
+
// posts a one-time "⏳ still working — /stop to cancel" message so a long turn
|
|
59
|
+
// reads as alive (not the old false "stream interrupted"). See
|
|
60
|
+
// docs/0.16-turn-ceiling-busy-aware-spec.md.
|
|
61
|
+
onTurnExtended: 'turn-extended',
|
|
56
62
|
// 0.12 interactive questions: CliProcess emits 'question-asked'
|
|
57
63
|
// {sessionKey, chatId, threadId, turnId, toolCallId, questions} when claude calls
|
|
58
64
|
// the `ask` tool. The callback (polygram) renders the Telegram inline keyboard;
|
|
@@ -351,6 +357,12 @@ class ProcessManager {
|
|
|
351
357
|
_hasPinnedSession() {
|
|
352
358
|
for (const p of this.procs.values()) {
|
|
353
359
|
if (!p.inFlight && p.hasActiveBackgroundWork()) return true;
|
|
360
|
+
// 0.16 (MF-B): an extended busy-aware-ceiling turn is a DURABLE blocker —
|
|
361
|
+
// it can hold its slot up to the hard backstop (90min), not "seconds" like
|
|
362
|
+
// a normal in-flight turn. Treat it as a pin so getOrSpawn SOFT-overflows
|
|
363
|
+
// (spawn over budget + warn) instead of park-then-reject, which would deny
|
|
364
|
+
// service to other chats for the full 5-min LRU wait.
|
|
365
|
+
if (p.inFlight && typeof p.hasExtendedTurn === 'function' && p.hasExtendedTurn()) return true;
|
|
354
366
|
}
|
|
355
367
|
return false;
|
|
356
368
|
}
|
|
@@ -359,6 +371,7 @@ class ProcessManager {
|
|
|
359
371
|
const keys = [];
|
|
360
372
|
for (const [k, p] of this.procs.entries()) {
|
|
361
373
|
if (!p.inFlight && p.hasActiveBackgroundWork()) keys.push(k);
|
|
374
|
+
else if (p.inFlight && typeof p.hasExtendedTurn === 'function' && p.hasExtendedTurn()) keys.push(k);
|
|
362
375
|
}
|
|
363
376
|
return keys;
|
|
364
377
|
}
|
package/lib/sdk/callbacks.js
CHANGED
|
@@ -411,6 +411,41 @@ function createSdkCallbacks({
|
|
|
411
411
|
}
|
|
412
412
|
},
|
|
413
413
|
|
|
414
|
+
// 0.16 busy-aware ceiling: CliProcess emits 'turn-extended' the FIRST time a
|
|
415
|
+
// turn passes the 30-min checkpoint while still provably working. Post ONE
|
|
416
|
+
// honest "still working — /stop" message so a long turn reads as alive
|
|
417
|
+
// instead of the old false "stream interrupted". The cli side flags
|
|
418
|
+
// _extended once per turn, so this fires at most once per long turn.
|
|
419
|
+
// Opt-out per chat via progressPings:false. NO "ask how it's going" — a
|
|
420
|
+
// foreground-streaming turn can't answer (review F1).
|
|
421
|
+
onTurnExtended: async (sessionKey, payload) => {
|
|
422
|
+
try {
|
|
423
|
+
if (!bot) return;
|
|
424
|
+
const chatId = getChatIdFromKey(sessionKey);
|
|
425
|
+
const chatCfg = (config && config.chats && config.chats[chatId]) || {};
|
|
426
|
+
// Precedence: per-chat overrides default (a chat can re-enable pings even
|
|
427
|
+
// if the global default disables them, and vice-versa). Default ON.
|
|
428
|
+
const chatPings = chatCfg.progressPings;
|
|
429
|
+
const enabled = chatPings !== undefined
|
|
430
|
+
? chatPings !== false
|
|
431
|
+
: (config && config.defaults && config.defaults.progressPings) !== false;
|
|
432
|
+
if (!enabled) return;
|
|
433
|
+
const threadIdRaw = getThreadIdFromKey(sessionKey);
|
|
434
|
+
const threadId = threadIdRaw ? parseInt(threadIdRaw, 10) : null;
|
|
435
|
+
await tg(bot, 'sendMessage', {
|
|
436
|
+
chat_id: chatId,
|
|
437
|
+
text: '⏳ Still working on this — it\'s taking a while. Send /stop to cancel.',
|
|
438
|
+
...(Number.isInteger(threadId) && { message_thread_id: threadId }),
|
|
439
|
+
}, { source: 'turn-extended', botName });
|
|
440
|
+
logEvent('turn-extended-ping', {
|
|
441
|
+
chat_id: chatId, session_key: sessionKey, thread_id: threadIdRaw,
|
|
442
|
+
elapsed_ms: payload?.elapsedMs ?? null,
|
|
443
|
+
});
|
|
444
|
+
} catch (err) {
|
|
445
|
+
logger.error?.(`[${botName}] turn-extended handler: ${err.message}`);
|
|
446
|
+
}
|
|
447
|
+
},
|
|
448
|
+
|
|
414
449
|
// R8: a failed autosteer paste. injectUserMessage fires
|
|
415
450
|
// `inject-fail` when its fire-and-forget paste rejects (tmux
|
|
416
451
|
// server gone, paste-buffer error, etc.). Before this handler was
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "polygram",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.16.0",
|
|
4
4
|
"description": "Telegram daemon for Claude Code that preserves the OpenClaw per-chat session model. Migration path for OpenClaw users moving to Claude Code.",
|
|
5
5
|
"main": "lib/ipc/client.js",
|
|
6
6
|
"bin": {
|
package/polygram.js
CHANGED
|
@@ -108,7 +108,7 @@ const { startTyping } = require('./lib/telegram/typing');
|
|
|
108
108
|
const { createReactionManager, classifyToolName } = require('./lib/telegram/reactions');
|
|
109
109
|
const { createMediaGroupBuffer } = require('./lib/media-group-buffer');
|
|
110
110
|
const { applyReactionToMessages } = require('./lib/telegram/album-reactions');
|
|
111
|
-
const { classify: classifyError, detectWedgedSessionError, isTransientHttpError } = require('./lib/error/classify');
|
|
111
|
+
const { classify: classifyError, classifyTurnEndError, detectWedgedSessionError, isTransientHttpError } = require('./lib/error/classify');
|
|
112
112
|
const { createAutoResumeTracker, isAutoResumable } = require('./lib/db/auto-resume');
|
|
113
113
|
const { resolveReplayWindowMs } = require('./lib/db/replay-window');
|
|
114
114
|
const { pruneEvents, resolveRetentionPolicy, validatePolicy } = require('./lib/db/events-retention');
|
|
@@ -510,6 +510,10 @@ async function sendToProcess(sessionKey, prompt, context = {}, { onDispatched }
|
|
|
510
510
|
const chatConfig = config.chats[chatId];
|
|
511
511
|
const timeoutMs = (chatConfig.timeout || config.defaults.timeout) * 1000;
|
|
512
512
|
const maxTurnMs = (chatConfig.maxTurn || config.defaults?.maxTurn || 1800) * 1000;
|
|
513
|
+
// 0.16 busy-aware ceiling: hard wall-clock backstop for a turn that keeps
|
|
514
|
+
// extending while provably working (cli backend). Per-chat → default →
|
|
515
|
+
// 90 min. The checkpoint never extends a turn past this.
|
|
516
|
+
const maxTurnHardMs = (chatConfig.maxTurnHard || config.defaults?.maxTurnHard || 5400) * 1000;
|
|
513
517
|
|
|
514
518
|
// 0.12 Phase 2.1: HeartbeatReactor binding removed for CliProcess.
|
|
515
519
|
// 0.11.0-channels needed a random-cycling working-pool reactor because
|
|
@@ -540,7 +544,7 @@ async function sendToProcess(sessionKey, prompt, context = {}, { onDispatched }
|
|
|
540
544
|
// starts, which is the correct UX (and what the user already expects).
|
|
541
545
|
const release = await stdinLock.acquire(sessionKey);
|
|
542
546
|
try {
|
|
543
|
-
const turnP = pm.send(sessionKey, prompt, { timeoutMs, maxTurnMs, context });
|
|
547
|
+
const turnP = pm.send(sessionKey, prompt, { timeoutMs, maxTurnMs, maxTurnHardMs, context });
|
|
544
548
|
// Phase 3 §4: pm.send synchronously kicks off the turn — the
|
|
545
549
|
// process is now inFlight. Signal the committed-intent latch so
|
|
546
550
|
// it can release; a concurrent handler will then correctly see
|
|
@@ -1711,12 +1715,14 @@ async function handleMessage(sessionKey, chatId, msg, bot) {
|
|
|
1711
1715
|
// On shutdown, leave the reactor state as-is — boot-replay's
|
|
1712
1716
|
// fresh dispatch will set its own reactor.
|
|
1713
1717
|
} else {
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
}
|
|
1718
|
+
// 0.16: branch the bubble suffix + reactor on err.code via the pure
|
|
1719
|
+
// classifyTurnEndError helper (the cli TURN_TIMEOUT message is
|
|
1720
|
+
// `turn timeout (...)`, which does NOT match the legacy regex — branch on
|
|
1721
|
+
// code, not text). TURN_TIMEOUT (went quiet) / TURN_MAX_EXCEEDED (hit hard
|
|
1722
|
+
// cap) → TIMEOUT reactor; anything else → ERROR.
|
|
1723
|
+
const { errorSuffix, reactorState } = classifyTurnEndError(err);
|
|
1724
|
+
await streamer.finalize('', errorSuffix ? { errorSuffix } : {}).catch(() => {});
|
|
1725
|
+
reactor.setState(reactorState);
|
|
1720
1726
|
}
|
|
1721
1727
|
throw err;
|
|
1722
1728
|
} finally {
|