instar 0.28.65 → 0.28.67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +53 -0
- package/dist/cli.js.map +1 -1
- package/dist/core/AutoDispatcher.d.ts +4 -1
- package/dist/core/AutoDispatcher.d.ts.map +1 -1
- package/dist/core/AutoDispatcher.js +5 -4
- package/dist/core/AutoDispatcher.js.map +1 -1
- package/dist/core/AutoUpdater.d.ts +6 -1
- package/dist/core/AutoUpdater.d.ts.map +1 -1
- package/dist/core/AutoUpdater.js +7 -4
- package/dist/core/AutoUpdater.js.map +1 -1
- package/dist/lifeline/LifelineHealthWatchdog.d.ts +81 -0
- package/dist/lifeline/LifelineHealthWatchdog.d.ts.map +1 -0
- package/dist/lifeline/LifelineHealthWatchdog.js +122 -0
- package/dist/lifeline/LifelineHealthWatchdog.js.map +1 -0
- package/dist/lifeline/RestartOrchestrator.d.ts +73 -0
- package/dist/lifeline/RestartOrchestrator.d.ts.map +1 -0
- package/dist/lifeline/RestartOrchestrator.js +124 -0
- package/dist/lifeline/RestartOrchestrator.js.map +1 -0
- package/dist/lifeline/TelegramLifeline.d.ts +55 -1
- package/dist/lifeline/TelegramLifeline.d.ts.map +1 -1
- package/dist/lifeline/TelegramLifeline.js +364 -41
- package/dist/lifeline/TelegramLifeline.js.map +1 -1
- package/dist/lifeline/droppedMessages.d.ts +67 -0
- package/dist/lifeline/droppedMessages.d.ts.map +1 -0
- package/dist/lifeline/droppedMessages.js +179 -0
- package/dist/lifeline/droppedMessages.js.map +1 -0
- package/dist/lifeline/forwardErrors.d.ts +38 -0
- package/dist/lifeline/forwardErrors.d.ts.map +1 -0
- package/dist/lifeline/forwardErrors.js +53 -0
- package/dist/lifeline/forwardErrors.js.map +1 -0
- package/dist/lifeline/rateLimitState.d.ts +63 -0
- package/dist/lifeline/rateLimitState.d.ts.map +1 -0
- package/dist/lifeline/rateLimitState.js +110 -0
- package/dist/lifeline/rateLimitState.js.map +1 -0
- package/dist/lifeline/retryWithBackoff.d.ts +28 -0
- package/dist/lifeline/retryWithBackoff.d.ts.map +1 -0
- package/dist/lifeline/retryWithBackoff.js +34 -0
- package/dist/lifeline/retryWithBackoff.js.map +1 -0
- package/dist/lifeline/startupMarker.d.ts +20 -0
- package/dist/lifeline/startupMarker.d.ts.map +1 -0
- package/dist/lifeline/startupMarker.js +52 -0
- package/dist/lifeline/startupMarker.js.map +1 -0
- package/dist/lifeline/versionHandshake.d.ts +40 -0
- package/dist/lifeline/versionHandshake.d.ts.map +1 -0
- package/dist/lifeline/versionHandshake.js +45 -0
- package/dist/lifeline/versionHandshake.js.map +1 -0
- package/dist/messaging/shared/compactionResumePayload.d.ts +1 -1
- package/dist/messaging/shared/compactionResumePayload.d.ts.map +1 -1
- package/dist/messaging/shared/compactionResumePayload.js +14 -5
- package/dist/messaging/shared/compactionResumePayload.js.map +1 -1
- package/dist/server/routes.d.ts.map +1 -1
- package/dist/server/routes.js +58 -1
- package/dist/server/routes.js.map +1 -1
- package/package.json +1 -1
- package/src/data/builtin-manifest.json +82 -82
- package/upgrades/0.28.66.md +44 -0
- package/upgrades/0.28.67.md +58 -0
- package/upgrades/side-effects/0.28.65.md +59 -0
- package/upgrades/side-effects/0.28.66.md +130 -0
- package/upgrades/side-effects/lifeline-message-drop-stage-a.md +155 -0
- package/upgrades/side-effects/lifeline-self-restart-stage-b.md +129 -0
- package/upgrades/NEXT.md +0 -53
|
@@ -24,13 +24,21 @@ import fs from 'node:fs';
|
|
|
24
24
|
import os from 'node:os';
|
|
25
25
|
import path from 'node:path';
|
|
26
26
|
import pc from 'picocolors';
|
|
27
|
-
import { loadConfig, ensureStateDir, detectTmuxPath } from '../core/Config.js';
|
|
27
|
+
import { loadConfig, ensureStateDir, detectTmuxPath, getInstarVersion } from '../core/Config.js';
|
|
28
28
|
import { registerAgent, unregisterAgent, startHeartbeat } from '../core/AgentRegistry.js';
|
|
29
29
|
// setup.ts uses @inquirer/prompts which requires Node 20.12+
|
|
30
30
|
// Dynamic import to avoid breaking the lifeline on older Node versions
|
|
31
31
|
// import { installAutoStart } from '../commands/setup.js';
|
|
32
32
|
import { MessageQueue } from './MessageQueue.js';
|
|
33
33
|
import { ServerSupervisor } from './ServerSupervisor.js';
|
|
34
|
+
import { retryWithBackoff } from './retryWithBackoff.js';
|
|
35
|
+
import { notifyMessageDropped } from './droppedMessages.js';
|
|
36
|
+
import { ForwardTransientError, ForwardBadRequestError, ForwardServerBootError, ForwardVersionSkewError, isTerminalForwardError, } from './forwardErrors.js';
|
|
37
|
+
import { writeStartupMarker } from './startupMarker.js';
|
|
38
|
+
import { RestartOrchestrator } from './RestartOrchestrator.js';
|
|
39
|
+
import { LifelineHealthWatchdog, DEFAULT_WATCHDOG_THRESHOLDS, } from './LifelineHealthWatchdog.js';
|
|
40
|
+
import { readRateLimitState, decide as decideRateLimit, writeRateLimitState, isRestartStorm, } from './rateLimitState.js';
|
|
41
|
+
import { DegradationReporter } from '../monitoring/DegradationReporter.js';
|
|
34
42
|
/**
|
|
35
43
|
* Acquire an exclusive lock file to prevent multiple lifeline instances.
|
|
36
44
|
* Returns true if lock acquired, false if another instance holds it.
|
|
@@ -232,7 +240,21 @@ export class TelegramLifeline {
|
|
|
232
240
|
console.log(pc.bold(`Starting Telegram Lifeline for ${pc.cyan(this.projectConfig.projectName)}`));
|
|
233
241
|
console.log(` Port: ${this.projectConfig.port}`);
|
|
234
242
|
console.log(` State: ${this.projectConfig.stateDir}`);
|
|
243
|
+
console.log(` Version: ${this.lifelineVersion}`);
|
|
235
244
|
console.log();
|
|
245
|
+
// Stage B: startup liveness marker. Every startup, regardless of cause,
|
|
246
|
+
// writes this file so `instar lifeline restart` can detect pid changes.
|
|
247
|
+
writeStartupMarker(this.projectConfig.stateDir, this.lifelineVersion);
|
|
248
|
+
// Stage B: startup coherence check. Guards against respawning into a
|
|
249
|
+
// half-written shadow install where the bundled package.json advertises
|
|
250
|
+
// a version but the code is broken or missing. The getInstarVersion()
|
|
251
|
+
// helper is the same one used below; if it returns '0.0.0' (its error
|
|
252
|
+
// fallback), the install is incoherent — exit code 2 so launchd throttles
|
|
253
|
+
// respawn rather than tight-looping.
|
|
254
|
+
if (this.lifelineVersion === '0.0.0') {
|
|
255
|
+
console.error(pc.red('[Lifeline] startup coherence check failed: package.json missing or unreadable. Exiting with code 2 for launchd throttle.'));
|
|
256
|
+
process.exit(2);
|
|
257
|
+
}
|
|
236
258
|
// Acquire exclusive lock — prevent multiple lifeline instances
|
|
237
259
|
if (!acquireLockFile(this.lockPath)) {
|
|
238
260
|
console.error(pc.red('[Lifeline] Another lifeline instance is already running. Exiting.'));
|
|
@@ -279,6 +301,10 @@ export class TelegramLifeline {
|
|
|
279
301
|
this.replayQueue();
|
|
280
302
|
}
|
|
281
303
|
}, 15_000);
|
|
304
|
+
// Stage B: install the restart orchestrator and health watchdog.
|
|
305
|
+
// In unsupervised mode (no INSTAR_SUPERVISED=1 and no launchd parent),
|
|
306
|
+
// the orchestrator emits signals and logs but skips process.exit.
|
|
307
|
+
this.installOrchestratorAndWatchdog();
|
|
282
308
|
// Replay any messages queued from previous lifeline runs
|
|
283
309
|
if (this.queue.length > 0) {
|
|
284
310
|
console.log(` ${this.queue.length} queued messages from previous run`);
|
|
@@ -312,37 +338,209 @@ export class TelegramLifeline {
|
|
|
312
338
|
// without any visible error in the server logs — the agent appears alive
|
|
313
339
|
// but never responds to messages.
|
|
314
340
|
this.selfHealSettingsJson();
|
|
315
|
-
// Graceful shutdown —
|
|
316
|
-
//
|
|
317
|
-
//
|
|
318
|
-
const
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
clearInterval(this.replayInterval);
|
|
325
|
-
try {
|
|
326
|
-
if (this.stopHeartbeat)
|
|
327
|
-
this.stopHeartbeat();
|
|
328
|
-
}
|
|
329
|
-
catch { /* non-critical */ }
|
|
330
|
-
try {
|
|
331
|
-
unregisterAgent(this.projectConfig.projectDir + '-lifeline');
|
|
332
|
-
}
|
|
333
|
-
catch { /* ELOCKED is non-critical during shutdown */ }
|
|
334
|
-
try {
|
|
335
|
-
releaseLockFile(this.lockPath);
|
|
341
|
+
// Graceful shutdown — SIGTERM/SIGINT route through the orchestrator so
|
|
342
|
+
// external restarts (e.g., `instar lifeline restart` → launchctl kickstart)
|
|
343
|
+
// get the same quiesce+persist semantics as self-triggered ones.
|
|
344
|
+
const externalShutdown = async () => {
|
|
345
|
+
if (this.orchestrator) {
|
|
346
|
+
await this.orchestrator.requestRestart({
|
|
347
|
+
reason: 'external-signal',
|
|
348
|
+
bucket: 'watchdog',
|
|
349
|
+
});
|
|
336
350
|
}
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
351
|
+
else {
|
|
352
|
+
// Fallback if orchestrator wasn't installed (should not happen post-Stage-B)
|
|
353
|
+
console.log('\nLifeline shutting down (no orchestrator)...');
|
|
354
|
+
await this.quiesceEverything();
|
|
355
|
+
process.exit(0);
|
|
340
356
|
}
|
|
341
|
-
catch { /* best effort */ }
|
|
342
|
-
process.exit(0);
|
|
343
357
|
};
|
|
344
|
-
process.on('SIGINT',
|
|
345
|
-
process.on('SIGTERM',
|
|
358
|
+
process.on('SIGINT', externalShutdown);
|
|
359
|
+
process.on('SIGTERM', externalShutdown);
|
|
360
|
+
}
|
|
361
|
+
/**
|
|
362
|
+
* Stop all in-flight / scheduled mutation sources so the queue snapshot
|
|
363
|
+
* is consistent when persisted.
|
|
364
|
+
*/
|
|
365
|
+
async quiesceEverything() {
|
|
366
|
+
this.polling = false;
|
|
367
|
+
if (this.pollTimeout)
|
|
368
|
+
clearTimeout(this.pollTimeout);
|
|
369
|
+
if (this.replayInterval) {
|
|
370
|
+
clearInterval(this.replayInterval);
|
|
371
|
+
this.replayInterval = null;
|
|
372
|
+
}
|
|
373
|
+
if (this.watchdog)
|
|
374
|
+
this.watchdog.stop();
|
|
375
|
+
try {
|
|
376
|
+
if (this.stopHeartbeat)
|
|
377
|
+
this.stopHeartbeat();
|
|
378
|
+
}
|
|
379
|
+
catch { /* non-critical */ }
|
|
380
|
+
try {
|
|
381
|
+
unregisterAgent(this.projectConfig.projectDir + '-lifeline');
|
|
382
|
+
}
|
|
383
|
+
catch { /* non-critical */ }
|
|
384
|
+
try {
|
|
385
|
+
releaseLockFile(this.lockPath);
|
|
386
|
+
}
|
|
387
|
+
catch { /* non-critical */ }
|
|
388
|
+
try {
|
|
389
|
+
await this.supervisor.stop();
|
|
390
|
+
}
|
|
391
|
+
catch { /* best-effort */ }
|
|
392
|
+
}
|
|
393
|
+
/**
|
|
394
|
+
* Install the restart orchestrator and watchdog. Called from start().
|
|
395
|
+
*
|
|
396
|
+
* The orchestrator owns the process.exit call. The watchdog requests
|
|
397
|
+
* restarts via the orchestrator on threshold crossings, subject to
|
|
398
|
+
* rate-limit state on disk.
|
|
399
|
+
*/
|
|
400
|
+
installOrchestratorAndWatchdog() {
|
|
401
|
+
const isSupervised = process.env.INSTAR_SUPERVISED === '1' ||
|
|
402
|
+
process.env.NODE_ENV !== 'test' && process.ppid === 1;
|
|
403
|
+
this.orchestrator = new RestartOrchestrator({
|
|
404
|
+
quiesce: () => this.quiesceEverything(),
|
|
405
|
+
persistAll: async () => {
|
|
406
|
+
// Each persist is best-effort; Promise.all so they run in parallel.
|
|
407
|
+
await Promise.all([
|
|
408
|
+
this.persistRateLimitSafe(),
|
|
409
|
+
// Queue + dropped-messages are already atomically persisted by
|
|
410
|
+
// existing code paths (MessageQueue.save, notifyMessageDropped's
|
|
411
|
+
// atomic write). A no-op here is correct — the goal is "nothing
|
|
412
|
+
// is in-flight that would need a final flush."
|
|
413
|
+
Promise.resolve(),
|
|
414
|
+
]);
|
|
415
|
+
},
|
|
416
|
+
exitFn: (code) => process.exit(code),
|
|
417
|
+
isSupervised,
|
|
418
|
+
isShadowInstallUpdating: () => {
|
|
419
|
+
// Shadow-install sibling path: `.instar/shadow-install/.updating`.
|
|
420
|
+
// stateDir is `.instar/state`; we check one level up for the lockfile.
|
|
421
|
+
const lockPath = path.join(path.dirname(this.projectConfig.stateDir), 'shadow-install', '.updating');
|
|
422
|
+
try {
|
|
423
|
+
return fs.existsSync(lockPath);
|
|
424
|
+
}
|
|
425
|
+
catch {
|
|
426
|
+
return false;
|
|
427
|
+
}
|
|
428
|
+
},
|
|
429
|
+
});
|
|
430
|
+
const onTrip = (result) => {
|
|
431
|
+
this.initiateRestart('watchdog', result.primary ?? 'unknown', {
|
|
432
|
+
tripped: result.tripped,
|
|
433
|
+
snapshot: result.snapshot,
|
|
434
|
+
});
|
|
435
|
+
};
|
|
436
|
+
this.watchdog = new LifelineHealthWatchdog({
|
|
437
|
+
thresholds: this.loadThresholdOverrides(),
|
|
438
|
+
getInputs: () => ({
|
|
439
|
+
now: Date.now(),
|
|
440
|
+
oldestQueueItemEnqueuedAt: this.oldestQueueItemEnqueuedAt(),
|
|
441
|
+
consecutiveForwardFailures: this.consecutiveForwardFailures,
|
|
442
|
+
conflict409StartedAt: this.conflict409StartedAt,
|
|
443
|
+
serverHealthy: this.supervisor.getStatus().healthy,
|
|
444
|
+
}),
|
|
445
|
+
onTrip,
|
|
446
|
+
onStarved: (gap) => {
|
|
447
|
+
DegradationReporter.getInstance().report({
|
|
448
|
+
feature: 'TelegramLifeline.watchdogStarved',
|
|
449
|
+
primary: 'Watchdog tick on schedule',
|
|
450
|
+
fallback: `Tick gap ${Math.round(gap / 1000)}s — event loop blocked`,
|
|
451
|
+
reason: 'setInterval delayed by blocked loop',
|
|
452
|
+
impact: 'Observability only; watchdog still functional at coarser granularity.',
|
|
453
|
+
});
|
|
454
|
+
},
|
|
455
|
+
autoStart: process.env.NODE_ENV !== 'test',
|
|
456
|
+
});
|
|
457
|
+
}
|
|
458
|
+
/** Extract oldest queue item's enqueue timestamp as ms, if any. */
|
|
459
|
+
oldestQueueItemEnqueuedAt() {
|
|
460
|
+
const peeked = this.queue.peek();
|
|
461
|
+
if (peeked.length === 0)
|
|
462
|
+
return undefined;
|
|
463
|
+
const ts = Date.parse(peeked[0].timestamp);
|
|
464
|
+
return Number.isFinite(ts) ? ts : undefined;
|
|
465
|
+
}
|
|
466
|
+
/** Read config overrides for watchdog thresholds. */
|
|
467
|
+
loadThresholdOverrides() {
|
|
468
|
+
const raw = this.projectConfig.lifeline?.watchdog;
|
|
469
|
+
if (!raw || typeof raw !== 'object')
|
|
470
|
+
return {};
|
|
471
|
+
const valid = (v) => typeof v === 'number' && Number.isFinite(v) && v > 0;
|
|
472
|
+
const out = {};
|
|
473
|
+
if (valid(raw.tickIntervalMs))
|
|
474
|
+
out.tickIntervalMs = raw.tickIntervalMs;
|
|
475
|
+
if (valid(raw.noForwardStuckMs))
|
|
476
|
+
out.noForwardStuckMs = raw.noForwardStuckMs;
|
|
477
|
+
if (valid(raw.consecutiveFailureMax))
|
|
478
|
+
out.consecutiveFailureMax = raw.consecutiveFailureMax;
|
|
479
|
+
if (valid(raw.conflict409StuckMs))
|
|
480
|
+
out.conflict409StuckMs = raw.conflict409StuckMs;
|
|
481
|
+
let hadInvalid = false;
|
|
482
|
+
for (const k of Object.keys(raw)) {
|
|
483
|
+
if (!(k in DEFAULT_WATCHDOG_THRESHOLDS))
|
|
484
|
+
hadInvalid = true;
|
|
485
|
+
else if (!valid(raw[k]))
|
|
486
|
+
hadInvalid = true;
|
|
487
|
+
}
|
|
488
|
+
if (hadInvalid) {
|
|
489
|
+
DegradationReporter.getInstance().report({
|
|
490
|
+
feature: 'TelegramLifeline.configInvalid',
|
|
491
|
+
primary: 'Valid watchdog threshold overrides',
|
|
492
|
+
fallback: 'Falling back to defaults for invalid keys',
|
|
493
|
+
reason: 'Non-finite, non-positive, or unknown override key in lifeline.watchdog',
|
|
494
|
+
impact: 'Threshold uses default; behavior unchanged but config is misleading.',
|
|
495
|
+
});
|
|
496
|
+
}
|
|
497
|
+
return out;
|
|
498
|
+
}
|
|
499
|
+
/** Persist rate-limit state. Safe to call during orchestrator persist. */
|
|
500
|
+
async persistRateLimitSafe() {
|
|
501
|
+
// The orchestrator invokes this while transitioning to 'persisting';
|
|
502
|
+
// rate-limit history was already written by initiateRestart() before
|
|
503
|
+
// the orchestrator was called. This is a final no-op flush.
|
|
504
|
+
return;
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Unified restart initiator: checks rate limit, writes history, then
|
|
508
|
+
* calls the orchestrator. Used by both the watchdog tick (bucket=watchdog)
|
|
509
|
+
* and the version-skew handler (bucket=versionSkew).
|
|
510
|
+
*/
|
|
511
|
+
initiateRestart(bucket, reason, context) {
|
|
512
|
+
const outcome = readRateLimitState(this.projectConfig.stateDir);
|
|
513
|
+
const dec = decideRateLimit(outcome, bucket);
|
|
514
|
+
if (!dec.allowed) {
|
|
515
|
+
console.log(`[Lifeline] restart suppressed by rate limit: ${dec.reason} (bucket=${bucket} reason=${reason})`);
|
|
516
|
+
return;
|
|
517
|
+
}
|
|
518
|
+
// Storm escalation signal (fires in addition to the normal restart
|
|
519
|
+
// signal so the operator sees that self-heal is not converging).
|
|
520
|
+
if (dec.stormActive || isRestartStorm(outcome.kind === 'ok' ? outcome.state : null)) {
|
|
521
|
+
DegradationReporter.getInstance().report({
|
|
522
|
+
feature: 'TelegramLifeline.restartStorm',
|
|
523
|
+
primary: 'Rate-limited self-restarts within ceiling',
|
|
524
|
+
fallback: 'Continuing to restart — underlying cause unresolved',
|
|
525
|
+
reason: `>= 6 restarts within the last hour; latest bucket=${bucket} reason=${reason}`,
|
|
526
|
+
impact: 'Operator should investigate; self-heal is not converging.',
|
|
527
|
+
});
|
|
528
|
+
}
|
|
529
|
+
// Write the history entry BEFORE calling process.exit so the new lifeline
|
|
530
|
+
// sees the rate-limit state on startup. Best-effort — failure here still
|
|
531
|
+
// lets the restart proceed (orchestrator is authoritative).
|
|
532
|
+
try {
|
|
533
|
+
const prior = outcome.kind === 'ok' ? outcome.state : null;
|
|
534
|
+
writeRateLimitState(this.projectConfig.stateDir, reason, bucket, prior);
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
console.error(`[Lifeline] failed to write rate-limit state: ${err}`);
|
|
538
|
+
}
|
|
539
|
+
if (!this.orchestrator) {
|
|
540
|
+
console.error('[Lifeline] initiateRestart called before orchestrator was installed');
|
|
541
|
+
return;
|
|
542
|
+
}
|
|
543
|
+
void this.orchestrator.requestRestart({ reason, bucket, context });
|
|
346
544
|
}
|
|
347
545
|
// ── Stale Connection Flush ───────────────────────────────
|
|
348
546
|
/**
|
|
@@ -415,6 +613,8 @@ export class TelegramLifeline {
|
|
|
415
613
|
this.saveOffset();
|
|
416
614
|
}
|
|
417
615
|
// Success — reset backoff counters
|
|
616
|
+
if (this.consecutive409s > 0)
|
|
617
|
+
this.conflict409StartedAt = null; // 0→... edge
|
|
418
618
|
this.consecutive409s = 0;
|
|
419
619
|
this.consecutive429s = 0;
|
|
420
620
|
this.pollBackoffMs = this.config.pollIntervalMs ?? 2000;
|
|
@@ -428,6 +628,9 @@ export class TelegramLifeline {
|
|
|
428
628
|
}
|
|
429
629
|
// Handle 409 Conflict (multiple bot instances polling)
|
|
430
630
|
if (errMsg.includes('409') && errMsg.includes('Conflict')) {
|
|
631
|
+
// 0→>0 edge: record when conflict started so watchdog can time the stuck state.
|
|
632
|
+
if (this.consecutive409s === 0)
|
|
633
|
+
this.conflict409StartedAt = Date.now();
|
|
431
634
|
this.consecutive409s++;
|
|
432
635
|
// Exponential backoff: 4s, 8s, 16s, 32s, max 60s
|
|
433
636
|
this.pollBackoffMs = Math.min(60_000, 2000 * Math.pow(2, this.consecutive409s));
|
|
@@ -743,9 +946,35 @@ export class TelegramLifeline {
|
|
|
743
946
|
}
|
|
744
947
|
/**
|
|
745
948
|
* Forward a message to the Instar server's Telegram webhook.
|
|
949
|
+
*
|
|
950
|
+
* Attempts up to FORWARD_ATTEMPTS times with exponential backoff
|
|
951
|
+
* (1s, 2s base). A single 10s-timeout fetch per attempt. Returns true
|
|
952
|
+
* on the first success, false after all attempts fail. Giving the
|
|
953
|
+
* handoff a real chance to succeed closes the silent-drop window that
|
|
954
|
+
* the caller's queue-and-retry path papered over.
|
|
955
|
+
*/
|
|
956
|
+
static FORWARD_ATTEMPTS = 3;
|
|
957
|
+
static FORWARD_BACKOFF_BASE_MS = 1000;
|
|
958
|
+
/**
|
|
959
|
+
* `legacyStrict` — if a pre-Stage-B server strictly validates JSON and
|
|
960
|
+
* rejects the unknown `lifelineVersion` field with 400, the lifeline
|
|
961
|
+
* falls back to omitting it and pins this flag for the session.
|
|
746
962
|
*/
|
|
963
|
+
legacyStrictServer = false;
|
|
964
|
+
/** Full semver of this lifeline, read once at construction. */
|
|
965
|
+
lifelineVersion = getInstarVersion();
|
|
747
966
|
async forwardToServer(topicId, text, rawMsg) {
|
|
748
|
-
|
|
967
|
+
const buildBody = (includeVersion) => JSON.stringify({
|
|
968
|
+
topicId,
|
|
969
|
+
text,
|
|
970
|
+
fromUserId: rawMsg.from.id,
|
|
971
|
+
fromUsername: rawMsg.from.username,
|
|
972
|
+
fromFirstName: rawMsg.from.first_name,
|
|
973
|
+
messageId: rawMsg.message_id,
|
|
974
|
+
timestamp: new Date(rawMsg.date * 1000).toISOString(),
|
|
975
|
+
...(includeVersion ? { lifelineVersion: this.lifelineVersion } : {}),
|
|
976
|
+
});
|
|
977
|
+
const doForward = async () => {
|
|
749
978
|
const controller = new AbortController();
|
|
750
979
|
const timer = setTimeout(() => controller.abort(), 10_000);
|
|
751
980
|
try {
|
|
@@ -756,27 +985,101 @@ export class TelegramLifeline {
|
|
|
756
985
|
const response = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, {
|
|
757
986
|
method: 'POST',
|
|
758
987
|
headers: fwdHeaders,
|
|
759
|
-
body:
|
|
760
|
-
topicId,
|
|
761
|
-
text,
|
|
762
|
-
fromUserId: rawMsg.from.id,
|
|
763
|
-
fromUsername: rawMsg.from.username,
|
|
764
|
-
fromFirstName: rawMsg.from.first_name,
|
|
765
|
-
messageId: rawMsg.message_id,
|
|
766
|
-
timestamp: new Date(rawMsg.date * 1000).toISOString(),
|
|
767
|
-
}),
|
|
988
|
+
body: buildBody(!this.legacyStrictServer),
|
|
768
989
|
signal: controller.signal,
|
|
769
990
|
});
|
|
770
|
-
|
|
991
|
+
if (response.ok)
|
|
992
|
+
return true;
|
|
993
|
+
if (response.status === 426) {
|
|
994
|
+
const body = (await response.json().catch(() => ({})));
|
|
995
|
+
throw new ForwardVersionSkewError(426, body);
|
|
996
|
+
}
|
|
997
|
+
if (response.status === 503) {
|
|
998
|
+
const body = (await response.json().catch(() => ({})));
|
|
999
|
+
throw new ForwardServerBootError(body.retryAfterMs ?? 1000);
|
|
1000
|
+
}
|
|
1001
|
+
if (response.status === 400) {
|
|
1002
|
+
const body = await response.json().catch(() => ({}));
|
|
1003
|
+
// Graceful degradation: if we included lifelineVersion and the
|
|
1004
|
+
// server rejected the request, retry once without it.
|
|
1005
|
+
if (!this.legacyStrictServer) {
|
|
1006
|
+
this.legacyStrictServer = true;
|
|
1007
|
+
console.warn(`[Lifeline] server returned 400 with lifelineVersion; ` +
|
|
1008
|
+
`retrying without (legacyStrictServer=true)`);
|
|
1009
|
+
// Re-issue the request WITHOUT the version field and return the
|
|
1010
|
+
// result of that retry. If still 400, it's a genuine bad request.
|
|
1011
|
+
const r2 = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, { method: 'POST', headers: fwdHeaders, body: buildBody(false) });
|
|
1012
|
+
if (r2.ok)
|
|
1013
|
+
return true;
|
|
1014
|
+
if (r2.status === 400)
|
|
1015
|
+
throw new ForwardBadRequestError(await r2.json().catch(() => ({})));
|
|
1016
|
+
throw new ForwardTransientError(r2.status);
|
|
1017
|
+
}
|
|
1018
|
+
throw new ForwardBadRequestError(body);
|
|
1019
|
+
}
|
|
1020
|
+
throw new ForwardTransientError(response.status);
|
|
771
1021
|
}
|
|
772
1022
|
finally {
|
|
773
1023
|
clearTimeout(timer);
|
|
774
1024
|
}
|
|
1025
|
+
};
|
|
1026
|
+
try {
|
|
1027
|
+
await retryWithBackoff(doForward, {
|
|
1028
|
+
attempts: TelegramLifeline.FORWARD_ATTEMPTS,
|
|
1029
|
+
baseMs: TelegramLifeline.FORWARD_BACKOFF_BASE_MS,
|
|
1030
|
+
isTerminal: isTerminalForwardError,
|
|
1031
|
+
onAttempt: (n, lastErr) => {
|
|
1032
|
+
if (n > 1) {
|
|
1033
|
+
console.warn(`[Lifeline] forwardToServer retry ${n}/${TelegramLifeline.FORWARD_ATTEMPTS} ` +
|
|
1034
|
+
`(topic ${topicId}, msg ${rawMsg.message_id}) — prior: ${lastErr?.message ?? 'unknown'}`);
|
|
1035
|
+
}
|
|
1036
|
+
},
|
|
1037
|
+
});
|
|
1038
|
+
// Record success for watchdog.
|
|
1039
|
+
this.consecutiveForwardFailures = 0;
|
|
1040
|
+
this.lastForwardSuccessAt = Date.now();
|
|
1041
|
+
return true;
|
|
775
1042
|
}
|
|
776
|
-
catch {
|
|
1043
|
+
catch (err) {
|
|
1044
|
+
// Version-skew handler: emit signal + request restart via orchestrator.
|
|
1045
|
+
if (err instanceof ForwardVersionSkewError) {
|
|
1046
|
+
this.handleVersionSkew(err);
|
|
1047
|
+
return false;
|
|
1048
|
+
}
|
|
1049
|
+
this.consecutiveForwardFailures++;
|
|
777
1050
|
return false;
|
|
778
1051
|
}
|
|
779
1052
|
}
|
|
1053
|
+
/**
|
|
1054
|
+
* Handle a 426 response from the server. Validates the response body's
|
|
1055
|
+
* `serverVersion` differs from this lifeline's, then requests restart
|
|
1056
|
+
* through the orchestrator. If the body is malformed or the versions
|
|
1057
|
+
* match (loopback impostor), treat as transient.
|
|
1058
|
+
*/
|
|
1059
|
+
handleVersionSkew(err) {
|
|
1060
|
+
const { body } = err;
|
|
1061
|
+
if (body.upgradeRequired !== true) {
|
|
1062
|
+
// Not a genuine Stage-B upgrade directive; treat as transient noise.
|
|
1063
|
+
this.consecutiveForwardFailures++;
|
|
1064
|
+
return;
|
|
1065
|
+
}
|
|
1066
|
+
if (typeof body.serverVersion !== 'string' || body.serverVersion === this.lifelineVersion) {
|
|
1067
|
+
// Loopback impostor or malformed body — don't trust it.
|
|
1068
|
+
console.warn(`[Lifeline] ignoring 426 with missing/matching serverVersion`);
|
|
1069
|
+
this.consecutiveForwardFailures++;
|
|
1070
|
+
return;
|
|
1071
|
+
}
|
|
1072
|
+
this.initiateRestart('versionSkew', 'version-skew', {
|
|
1073
|
+
serverVersion: body.serverVersion,
|
|
1074
|
+
lifelineVersion: this.lifelineVersion,
|
|
1075
|
+
});
|
|
1076
|
+
}
|
|
1077
|
+
/** Watchdog-tracked counters/state. */
|
|
1078
|
+
consecutiveForwardFailures = 0;
|
|
1079
|
+
lastForwardSuccessAt = 0;
|
|
1080
|
+
conflict409StartedAt = null;
|
|
1081
|
+
orchestrator = null;
|
|
1082
|
+
watchdog = null;
|
|
780
1083
|
// ── Lifeline Commands ─────────────────────────────────────
|
|
781
1084
|
async handleLifelineCommand(text, topicId, fromUserId) {
|
|
782
1085
|
const cmd = text.trim().toLowerCase();
|
|
@@ -884,6 +1187,26 @@ export class TelegramLifeline {
|
|
|
884
1187
|
const failures = msg.replayFailures ?? 0;
|
|
885
1188
|
if (failures >= TelegramLifeline.MAX_REPLAY_FAILURES) {
|
|
886
1189
|
dropped++;
|
|
1190
|
+
// Before the drop becomes silent: persist the record, report a
|
|
1191
|
+
// degradation, and tell the original sender their message was lost.
|
|
1192
|
+
try {
|
|
1193
|
+
await notifyMessageDropped({
|
|
1194
|
+
stateDir: this.projectConfig.stateDir,
|
|
1195
|
+
topicId: msg.topicId,
|
|
1196
|
+
messageId: msg.id,
|
|
1197
|
+
senderName: msg.fromFirstName ?? msg.fromUsername ?? String(msg.fromUserId),
|
|
1198
|
+
text: msg.text,
|
|
1199
|
+
retryCount: failures,
|
|
1200
|
+
reason: `Handoff to server failed after ${failures} replay attempts`,
|
|
1201
|
+
sendToTopic: (topicId, body) => this.sendToTopic(topicId, body),
|
|
1202
|
+
});
|
|
1203
|
+
}
|
|
1204
|
+
catch (err) {
|
|
1205
|
+
// notifyMessageDropped only throws on true disk failure after the notice/report paths
|
|
1206
|
+
// had their chance — surface and continue; we still want to drop this message so
|
|
1207
|
+
// the queue doesn't stall.
|
|
1208
|
+
console.error(`[Lifeline] notifyMessageDropped threw for ${msg.id}:`, err instanceof Error ? err.message : err);
|
|
1209
|
+
}
|
|
887
1210
|
console.warn(`[Lifeline] Dropping message ${msg.id} after ${failures} replay failures: ${msg.text.slice(0, 80)}`);
|
|
888
1211
|
continue;
|
|
889
1212
|
}
|