instar 0.28.65 → 0.28.67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/dist/cli.js +53 -0
  2. package/dist/cli.js.map +1 -1
  3. package/dist/core/AutoDispatcher.d.ts +4 -1
  4. package/dist/core/AutoDispatcher.d.ts.map +1 -1
  5. package/dist/core/AutoDispatcher.js +5 -4
  6. package/dist/core/AutoDispatcher.js.map +1 -1
  7. package/dist/core/AutoUpdater.d.ts +6 -1
  8. package/dist/core/AutoUpdater.d.ts.map +1 -1
  9. package/dist/core/AutoUpdater.js +7 -4
  10. package/dist/core/AutoUpdater.js.map +1 -1
  11. package/dist/lifeline/LifelineHealthWatchdog.d.ts +81 -0
  12. package/dist/lifeline/LifelineHealthWatchdog.d.ts.map +1 -0
  13. package/dist/lifeline/LifelineHealthWatchdog.js +122 -0
  14. package/dist/lifeline/LifelineHealthWatchdog.js.map +1 -0
  15. package/dist/lifeline/RestartOrchestrator.d.ts +73 -0
  16. package/dist/lifeline/RestartOrchestrator.d.ts.map +1 -0
  17. package/dist/lifeline/RestartOrchestrator.js +124 -0
  18. package/dist/lifeline/RestartOrchestrator.js.map +1 -0
  19. package/dist/lifeline/TelegramLifeline.d.ts +55 -1
  20. package/dist/lifeline/TelegramLifeline.d.ts.map +1 -1
  21. package/dist/lifeline/TelegramLifeline.js +364 -41
  22. package/dist/lifeline/TelegramLifeline.js.map +1 -1
  23. package/dist/lifeline/droppedMessages.d.ts +67 -0
  24. package/dist/lifeline/droppedMessages.d.ts.map +1 -0
  25. package/dist/lifeline/droppedMessages.js +179 -0
  26. package/dist/lifeline/droppedMessages.js.map +1 -0
  27. package/dist/lifeline/forwardErrors.d.ts +38 -0
  28. package/dist/lifeline/forwardErrors.d.ts.map +1 -0
  29. package/dist/lifeline/forwardErrors.js +53 -0
  30. package/dist/lifeline/forwardErrors.js.map +1 -0
  31. package/dist/lifeline/rateLimitState.d.ts +63 -0
  32. package/dist/lifeline/rateLimitState.d.ts.map +1 -0
  33. package/dist/lifeline/rateLimitState.js +110 -0
  34. package/dist/lifeline/rateLimitState.js.map +1 -0
  35. package/dist/lifeline/retryWithBackoff.d.ts +28 -0
  36. package/dist/lifeline/retryWithBackoff.d.ts.map +1 -0
  37. package/dist/lifeline/retryWithBackoff.js +34 -0
  38. package/dist/lifeline/retryWithBackoff.js.map +1 -0
  39. package/dist/lifeline/startupMarker.d.ts +20 -0
  40. package/dist/lifeline/startupMarker.d.ts.map +1 -0
  41. package/dist/lifeline/startupMarker.js +52 -0
  42. package/dist/lifeline/startupMarker.js.map +1 -0
  43. package/dist/lifeline/versionHandshake.d.ts +40 -0
  44. package/dist/lifeline/versionHandshake.d.ts.map +1 -0
  45. package/dist/lifeline/versionHandshake.js +45 -0
  46. package/dist/lifeline/versionHandshake.js.map +1 -0
  47. package/dist/messaging/shared/compactionResumePayload.d.ts +1 -1
  48. package/dist/messaging/shared/compactionResumePayload.d.ts.map +1 -1
  49. package/dist/messaging/shared/compactionResumePayload.js +14 -5
  50. package/dist/messaging/shared/compactionResumePayload.js.map +1 -1
  51. package/dist/server/routes.d.ts.map +1 -1
  52. package/dist/server/routes.js +58 -1
  53. package/dist/server/routes.js.map +1 -1
  54. package/package.json +1 -1
  55. package/src/data/builtin-manifest.json +82 -82
  56. package/upgrades/0.28.66.md +44 -0
  57. package/upgrades/0.28.67.md +58 -0
  58. package/upgrades/side-effects/0.28.65.md +59 -0
  59. package/upgrades/side-effects/0.28.66.md +130 -0
  60. package/upgrades/side-effects/lifeline-message-drop-stage-a.md +155 -0
  61. package/upgrades/side-effects/lifeline-self-restart-stage-b.md +129 -0
  62. package/upgrades/NEXT.md +0 -53
@@ -24,13 +24,21 @@ import fs from 'node:fs';
24
24
  import os from 'node:os';
25
25
  import path from 'node:path';
26
26
  import pc from 'picocolors';
27
- import { loadConfig, ensureStateDir, detectTmuxPath } from '../core/Config.js';
27
+ import { loadConfig, ensureStateDir, detectTmuxPath, getInstarVersion } from '../core/Config.js';
28
28
  import { registerAgent, unregisterAgent, startHeartbeat } from '../core/AgentRegistry.js';
29
29
  // setup.ts uses @inquirer/prompts which requires Node 20.12+
30
30
  // Dynamic import to avoid breaking the lifeline on older Node versions
31
31
  // import { installAutoStart } from '../commands/setup.js';
32
32
  import { MessageQueue } from './MessageQueue.js';
33
33
  import { ServerSupervisor } from './ServerSupervisor.js';
34
+ import { retryWithBackoff } from './retryWithBackoff.js';
35
+ import { notifyMessageDropped } from './droppedMessages.js';
36
+ import { ForwardTransientError, ForwardBadRequestError, ForwardServerBootError, ForwardVersionSkewError, isTerminalForwardError, } from './forwardErrors.js';
37
+ import { writeStartupMarker } from './startupMarker.js';
38
+ import { RestartOrchestrator } from './RestartOrchestrator.js';
39
+ import { LifelineHealthWatchdog, DEFAULT_WATCHDOG_THRESHOLDS, } from './LifelineHealthWatchdog.js';
40
+ import { readRateLimitState, decide as decideRateLimit, writeRateLimitState, isRestartStorm, } from './rateLimitState.js';
41
+ import { DegradationReporter } from '../monitoring/DegradationReporter.js';
34
42
  /**
35
43
  * Acquire an exclusive lock file to prevent multiple lifeline instances.
36
44
  * Returns true if lock acquired, false if another instance holds it.
@@ -232,7 +240,21 @@ export class TelegramLifeline {
232
240
  console.log(pc.bold(`Starting Telegram Lifeline for ${pc.cyan(this.projectConfig.projectName)}`));
233
241
  console.log(` Port: ${this.projectConfig.port}`);
234
242
  console.log(` State: ${this.projectConfig.stateDir}`);
243
+ console.log(` Version: ${this.lifelineVersion}`);
235
244
  console.log();
245
+ // Stage B: startup liveness marker. Every startup, regardless of cause,
246
+ // writes this file so `instar lifeline restart` can detect pid changes.
247
+ writeStartupMarker(this.projectConfig.stateDir, this.lifelineVersion);
248
+ // Stage B: startup coherence check. Guards against respawning into a
249
+ // half-written shadow install where the bundled package.json advertises
250
+ // a version but the code is broken or missing. The getInstarVersion()
251
+ // helper is the same one used below; if it returns '0.0.0' (its error
252
+ // fallback), the install is incoherent — exit code 2 so launchd throttles
253
+ // respawn rather than tight-looping.
254
+ if (this.lifelineVersion === '0.0.0') {
255
+ console.error(pc.red('[Lifeline] startup coherence check failed: package.json missing or unreadable. Exiting with code 2 for launchd throttle.'));
256
+ process.exit(2);
257
+ }
236
258
  // Acquire exclusive lock — prevent multiple lifeline instances
237
259
  if (!acquireLockFile(this.lockPath)) {
238
260
  console.error(pc.red('[Lifeline] Another lifeline instance is already running. Exiting.'));
@@ -279,6 +301,10 @@ export class TelegramLifeline {
279
301
  this.replayQueue();
280
302
  }
281
303
  }, 15_000);
304
+ // Stage B: install the restart orchestrator and health watchdog.
305
+ // In unsupervised mode (no INSTAR_SUPERVISED=1 and no launchd parent),
306
+ // the orchestrator emits signals and logs but skips process.exit.
307
+ this.installOrchestratorAndWatchdog();
282
308
  // Replay any messages queued from previous lifeline runs
283
309
  if (this.queue.length > 0) {
284
310
  console.log(` ${this.queue.length} queued messages from previous run`);
@@ -312,37 +338,209 @@ export class TelegramLifeline {
312
338
  // without any visible error in the server logs — the agent appears alive
313
339
  // but never responds to messages.
314
340
  this.selfHealSettingsJson();
315
- // Graceful shutdown — every step is wrapped in try-catch because a crash
316
- // during shutdown leaves the lifeline in a half-alive state that confuses
317
- // launchd's KeepAlive restart logic.
318
- const shutdown = async () => {
319
- console.log('\nLifeline shutting down...');
320
- this.polling = false;
321
- if (this.pollTimeout)
322
- clearTimeout(this.pollTimeout);
323
- if (this.replayInterval)
324
- clearInterval(this.replayInterval);
325
- try {
326
- if (this.stopHeartbeat)
327
- this.stopHeartbeat();
328
- }
329
- catch { /* non-critical */ }
330
- try {
331
- unregisterAgent(this.projectConfig.projectDir + '-lifeline');
332
- }
333
- catch { /* ELOCKED is non-critical during shutdown */ }
334
- try {
335
- releaseLockFile(this.lockPath);
341
+ // Graceful shutdown — SIGTERM/SIGINT route through the orchestrator so
342
+ // external restarts (e.g., `instar lifeline restart` launchctl kickstart)
343
+ // get the same quiesce+persist semantics as self-triggered ones.
344
+ const externalShutdown = async () => {
345
+ if (this.orchestrator) {
346
+ await this.orchestrator.requestRestart({
347
+ reason: 'external-signal',
348
+ bucket: 'watchdog',
349
+ });
336
350
  }
337
- catch { /* non-critical */ }
338
- try {
339
- await this.supervisor.stop();
351
+ else {
352
+ // Fallback if orchestrator wasn't installed (should not happen post-Stage-B)
353
+ console.log('\nLifeline shutting down (no orchestrator)...');
354
+ await this.quiesceEverything();
355
+ process.exit(0);
340
356
  }
341
- catch { /* best effort */ }
342
- process.exit(0);
343
357
  };
344
- process.on('SIGINT', shutdown);
345
- process.on('SIGTERM', shutdown);
358
+ process.on('SIGINT', externalShutdown);
359
+ process.on('SIGTERM', externalShutdown);
360
+ }
361
+ /**
362
+ * Stop all in-flight / scheduled mutation sources so the queue snapshot
363
+ * is consistent when persisted.
364
+ */
365
+ async quiesceEverything() {
366
+ this.polling = false;
367
+ if (this.pollTimeout)
368
+ clearTimeout(this.pollTimeout);
369
+ if (this.replayInterval) {
370
+ clearInterval(this.replayInterval);
371
+ this.replayInterval = null;
372
+ }
373
+ if (this.watchdog)
374
+ this.watchdog.stop();
375
+ try {
376
+ if (this.stopHeartbeat)
377
+ this.stopHeartbeat();
378
+ }
379
+ catch { /* non-critical */ }
380
+ try {
381
+ unregisterAgent(this.projectConfig.projectDir + '-lifeline');
382
+ }
383
+ catch { /* non-critical */ }
384
+ try {
385
+ releaseLockFile(this.lockPath);
386
+ }
387
+ catch { /* non-critical */ }
388
+ try {
389
+ await this.supervisor.stop();
390
+ }
391
+ catch { /* best-effort */ }
392
+ }
393
+ /**
394
+ * Install the restart orchestrator and watchdog. Called from start().
395
+ *
396
+ * The orchestrator owns the process.exit call. The watchdog requests
397
+ * restarts via the orchestrator on threshold crossings, subject to
398
+ * rate-limit state on disk.
399
+ */
400
+ installOrchestratorAndWatchdog() {
401
+ const isSupervised = process.env.INSTAR_SUPERVISED === '1' ||
402
+ process.env.NODE_ENV !== 'test' && process.ppid === 1;
403
+ this.orchestrator = new RestartOrchestrator({
404
+ quiesce: () => this.quiesceEverything(),
405
+ persistAll: async () => {
406
+ // Each persist is best-effort; Promise.all so they run in parallel.
407
+ await Promise.all([
408
+ this.persistRateLimitSafe(),
409
+ // Queue + dropped-messages are already atomically persisted by
410
+ // existing code paths (MessageQueue.save, notifyMessageDropped's
411
+ // atomic write). A no-op here is correct — the goal is "nothing
412
+ // is in-flight that would need a final flush."
413
+ Promise.resolve(),
414
+ ]);
415
+ },
416
+ exitFn: (code) => process.exit(code),
417
+ isSupervised,
418
+ isShadowInstallUpdating: () => {
419
+ // Shadow-install sibling path: `.instar/shadow-install/.updating`.
420
+ // stateDir is `.instar/state`; we check one level up for the lockfile.
421
+ const lockPath = path.join(path.dirname(this.projectConfig.stateDir), 'shadow-install', '.updating');
422
+ try {
423
+ return fs.existsSync(lockPath);
424
+ }
425
+ catch {
426
+ return false;
427
+ }
428
+ },
429
+ });
430
+ const onTrip = (result) => {
431
+ this.initiateRestart('watchdog', result.primary ?? 'unknown', {
432
+ tripped: result.tripped,
433
+ snapshot: result.snapshot,
434
+ });
435
+ };
436
+ this.watchdog = new LifelineHealthWatchdog({
437
+ thresholds: this.loadThresholdOverrides(),
438
+ getInputs: () => ({
439
+ now: Date.now(),
440
+ oldestQueueItemEnqueuedAt: this.oldestQueueItemEnqueuedAt(),
441
+ consecutiveForwardFailures: this.consecutiveForwardFailures,
442
+ conflict409StartedAt: this.conflict409StartedAt,
443
+ serverHealthy: this.supervisor.getStatus().healthy,
444
+ }),
445
+ onTrip,
446
+ onStarved: (gap) => {
447
+ DegradationReporter.getInstance().report({
448
+ feature: 'TelegramLifeline.watchdogStarved',
449
+ primary: 'Watchdog tick on schedule',
450
+ fallback: `Tick gap ${Math.round(gap / 1000)}s — event loop blocked`,
451
+ reason: 'setInterval delayed by blocked loop',
452
+ impact: 'Observability only; watchdog still functional at coarser granularity.',
453
+ });
454
+ },
455
+ autoStart: process.env.NODE_ENV !== 'test',
456
+ });
457
+ }
458
+ /** Extract oldest queue item's enqueue timestamp as ms, if any. */
459
+ oldestQueueItemEnqueuedAt() {
460
+ const peeked = this.queue.peek();
461
+ if (peeked.length === 0)
462
+ return undefined;
463
+ const ts = Date.parse(peeked[0].timestamp);
464
+ return Number.isFinite(ts) ? ts : undefined;
465
+ }
466
+ /** Read config overrides for watchdog thresholds. */
467
+ loadThresholdOverrides() {
468
+ const raw = this.projectConfig.lifeline?.watchdog;
469
+ if (!raw || typeof raw !== 'object')
470
+ return {};
471
+ const valid = (v) => typeof v === 'number' && Number.isFinite(v) && v > 0;
472
+ const out = {};
473
+ if (valid(raw.tickIntervalMs))
474
+ out.tickIntervalMs = raw.tickIntervalMs;
475
+ if (valid(raw.noForwardStuckMs))
476
+ out.noForwardStuckMs = raw.noForwardStuckMs;
477
+ if (valid(raw.consecutiveFailureMax))
478
+ out.consecutiveFailureMax = raw.consecutiveFailureMax;
479
+ if (valid(raw.conflict409StuckMs))
480
+ out.conflict409StuckMs = raw.conflict409StuckMs;
481
+ let hadInvalid = false;
482
+ for (const k of Object.keys(raw)) {
483
+ if (!(k in DEFAULT_WATCHDOG_THRESHOLDS))
484
+ hadInvalid = true;
485
+ else if (!valid(raw[k]))
486
+ hadInvalid = true;
487
+ }
488
+ if (hadInvalid) {
489
+ DegradationReporter.getInstance().report({
490
+ feature: 'TelegramLifeline.configInvalid',
491
+ primary: 'Valid watchdog threshold overrides',
492
+ fallback: 'Falling back to defaults for invalid keys',
493
+ reason: 'Non-finite, non-positive, or unknown override key in lifeline.watchdog',
494
+ impact: 'Threshold uses default; behavior unchanged but config is misleading.',
495
+ });
496
+ }
497
+ return out;
498
+ }
499
+ /** Persist rate-limit state. Safe to call during orchestrator persist. */
500
+ async persistRateLimitSafe() {
501
+ // The orchestrator invokes this while transitioning to 'persisting';
502
+ // rate-limit history was already written by initiateRestart() before
503
+ // the orchestrator was called. This is a final no-op flush.
504
+ return;
505
+ }
506
+ /**
507
+ * Unified restart initiator: checks rate limit, writes history, then
508
+ * calls the orchestrator. Used by both the watchdog tick (bucket=watchdog)
509
+ * and the version-skew handler (bucket=versionSkew).
510
+ */
511
+ initiateRestart(bucket, reason, context) {
512
+ const outcome = readRateLimitState(this.projectConfig.stateDir);
513
+ const dec = decideRateLimit(outcome, bucket);
514
+ if (!dec.allowed) {
515
+ console.log(`[Lifeline] restart suppressed by rate limit: ${dec.reason} (bucket=${bucket} reason=${reason})`);
516
+ return;
517
+ }
518
+ // Storm escalation signal (fires in addition to the normal restart
519
+ // signal so the operator sees that self-heal is not converging).
520
+ if (dec.stormActive || isRestartStorm(outcome.kind === 'ok' ? outcome.state : null)) {
521
+ DegradationReporter.getInstance().report({
522
+ feature: 'TelegramLifeline.restartStorm',
523
+ primary: 'Rate-limited self-restarts within ceiling',
524
+ fallback: 'Continuing to restart — underlying cause unresolved',
525
+ reason: `>= 6 restarts within the last hour; latest bucket=${bucket} reason=${reason}`,
526
+ impact: 'Operator should investigate; self-heal is not converging.',
527
+ });
528
+ }
529
+ // Write the history entry BEFORE calling process.exit so the new lifeline
530
+ // sees the rate-limit state on startup. Best-effort — failure here still
531
+ // lets the restart proceed (orchestrator is authoritative).
532
+ try {
533
+ const prior = outcome.kind === 'ok' ? outcome.state : null;
534
+ writeRateLimitState(this.projectConfig.stateDir, reason, bucket, prior);
535
+ }
536
+ catch (err) {
537
+ console.error(`[Lifeline] failed to write rate-limit state: ${err}`);
538
+ }
539
+ if (!this.orchestrator) {
540
+ console.error('[Lifeline] initiateRestart called before orchestrator was installed');
541
+ return;
542
+ }
543
+ void this.orchestrator.requestRestart({ reason, bucket, context });
346
544
  }
347
545
  // ── Stale Connection Flush ───────────────────────────────
348
546
  /**
@@ -415,6 +613,8 @@ export class TelegramLifeline {
415
613
  this.saveOffset();
416
614
  }
417
615
  // Success — reset backoff counters
616
+ if (this.consecutive409s > 0)
617
+ this.conflict409StartedAt = null; // 0→... edge
418
618
  this.consecutive409s = 0;
419
619
  this.consecutive429s = 0;
420
620
  this.pollBackoffMs = this.config.pollIntervalMs ?? 2000;
@@ -428,6 +628,9 @@ export class TelegramLifeline {
428
628
  }
429
629
  // Handle 409 Conflict (multiple bot instances polling)
430
630
  if (errMsg.includes('409') && errMsg.includes('Conflict')) {
631
+ // 0→>0 edge: record when conflict started so watchdog can time the stuck state.
632
+ if (this.consecutive409s === 0)
633
+ this.conflict409StartedAt = Date.now();
431
634
  this.consecutive409s++;
432
635
  // Exponential backoff: 4s, 8s, 16s, 32s, max 60s
433
636
  this.pollBackoffMs = Math.min(60_000, 2000 * Math.pow(2, this.consecutive409s));
@@ -743,9 +946,35 @@ export class TelegramLifeline {
743
946
  }
744
947
  /**
745
948
  * Forward a message to the Instar server's Telegram webhook.
949
+ *
950
+ * Attempts up to FORWARD_ATTEMPTS times with exponential backoff
951
+ * (1s, 2s base). A single 10s-timeout fetch per attempt. Returns true
952
+ * on the first success, false after all attempts fail. Giving the
953
+ * handoff a real chance to succeed closes the silent-drop window that
954
+ * the caller's queue-and-retry path papered over.
955
+ */
956
+ static FORWARD_ATTEMPTS = 3;
957
+ static FORWARD_BACKOFF_BASE_MS = 1000;
958
+ /**
959
+ * `legacyStrict` — if a pre-Stage-B server strictly validates JSON and
960
+ * rejects the unknown `lifelineVersion` field with 400, the lifeline
961
+ * falls back to omitting it and pins this flag for the session.
746
962
  */
963
+ legacyStrictServer = false;
964
+ /** Full semver of this lifeline, read once at construction. */
965
+ lifelineVersion = getInstarVersion();
747
966
  async forwardToServer(topicId, text, rawMsg) {
748
- try {
967
+ const buildBody = (includeVersion) => JSON.stringify({
968
+ topicId,
969
+ text,
970
+ fromUserId: rawMsg.from.id,
971
+ fromUsername: rawMsg.from.username,
972
+ fromFirstName: rawMsg.from.first_name,
973
+ messageId: rawMsg.message_id,
974
+ timestamp: new Date(rawMsg.date * 1000).toISOString(),
975
+ ...(includeVersion ? { lifelineVersion: this.lifelineVersion } : {}),
976
+ });
977
+ const doForward = async () => {
749
978
  const controller = new AbortController();
750
979
  const timer = setTimeout(() => controller.abort(), 10_000);
751
980
  try {
@@ -756,27 +985,101 @@ export class TelegramLifeline {
756
985
  const response = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, {
757
986
  method: 'POST',
758
987
  headers: fwdHeaders,
759
- body: JSON.stringify({
760
- topicId,
761
- text,
762
- fromUserId: rawMsg.from.id,
763
- fromUsername: rawMsg.from.username,
764
- fromFirstName: rawMsg.from.first_name,
765
- messageId: rawMsg.message_id,
766
- timestamp: new Date(rawMsg.date * 1000).toISOString(),
767
- }),
988
+ body: buildBody(!this.legacyStrictServer),
768
989
  signal: controller.signal,
769
990
  });
770
- return response.ok;
991
+ if (response.ok)
992
+ return true;
993
+ if (response.status === 426) {
994
+ const body = (await response.json().catch(() => ({})));
995
+ throw new ForwardVersionSkewError(426, body);
996
+ }
997
+ if (response.status === 503) {
998
+ const body = (await response.json().catch(() => ({})));
999
+ throw new ForwardServerBootError(body.retryAfterMs ?? 1000);
1000
+ }
1001
+ if (response.status === 400) {
1002
+ const body = await response.json().catch(() => ({}));
1003
+ // Graceful degradation: if we included lifelineVersion and the
1004
+ // server rejected the request, retry once without it.
1005
+ if (!this.legacyStrictServer) {
1006
+ this.legacyStrictServer = true;
1007
+ console.warn(`[Lifeline] server returned 400 with lifelineVersion; ` +
1008
+ `retrying without (legacyStrictServer=true)`);
1009
+ // Re-issue the request WITHOUT the version field and return the
1010
+ // result of that retry. If still 400, it's a genuine bad request.
1011
+ const r2 = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, { method: 'POST', headers: fwdHeaders, body: buildBody(false) });
1012
+ if (r2.ok)
1013
+ return true;
1014
+ if (r2.status === 400)
1015
+ throw new ForwardBadRequestError(await r2.json().catch(() => ({})));
1016
+ throw new ForwardTransientError(r2.status);
1017
+ }
1018
+ throw new ForwardBadRequestError(body);
1019
+ }
1020
+ throw new ForwardTransientError(response.status);
771
1021
  }
772
1022
  finally {
773
1023
  clearTimeout(timer);
774
1024
  }
1025
+ };
1026
+ try {
1027
+ await retryWithBackoff(doForward, {
1028
+ attempts: TelegramLifeline.FORWARD_ATTEMPTS,
1029
+ baseMs: TelegramLifeline.FORWARD_BACKOFF_BASE_MS,
1030
+ isTerminal: isTerminalForwardError,
1031
+ onAttempt: (n, lastErr) => {
1032
+ if (n > 1) {
1033
+ console.warn(`[Lifeline] forwardToServer retry ${n}/${TelegramLifeline.FORWARD_ATTEMPTS} ` +
1034
+ `(topic ${topicId}, msg ${rawMsg.message_id}) — prior: ${lastErr?.message ?? 'unknown'}`);
1035
+ }
1036
+ },
1037
+ });
1038
+ // Record success for watchdog.
1039
+ this.consecutiveForwardFailures = 0;
1040
+ this.lastForwardSuccessAt = Date.now();
1041
+ return true;
775
1042
  }
776
- catch {
1043
+ catch (err) {
1044
+ // Version-skew handler: emit signal + request restart via orchestrator.
1045
+ if (err instanceof ForwardVersionSkewError) {
1046
+ this.handleVersionSkew(err);
1047
+ return false;
1048
+ }
1049
+ this.consecutiveForwardFailures++;
777
1050
  return false;
778
1051
  }
779
1052
  }
1053
+ /**
1054
+ * Handle a 426 response from the server. Validates the response body's
1055
+ * `serverVersion` differs from this lifeline's, then requests restart
1056
+ * through the orchestrator. If the body is malformed or the versions
1057
+ * match (loopback impostor), treat as transient.
1058
+ */
1059
+ handleVersionSkew(err) {
1060
+ const { body } = err;
1061
+ if (body.upgradeRequired !== true) {
1062
+ // Not a genuine Stage-B upgrade directive; treat as transient noise.
1063
+ this.consecutiveForwardFailures++;
1064
+ return;
1065
+ }
1066
+ if (typeof body.serverVersion !== 'string' || body.serverVersion === this.lifelineVersion) {
1067
+ // Loopback impostor or malformed body — don't trust it.
1068
+ console.warn(`[Lifeline] ignoring 426 with missing/matching serverVersion`);
1069
+ this.consecutiveForwardFailures++;
1070
+ return;
1071
+ }
1072
+ this.initiateRestart('versionSkew', 'version-skew', {
1073
+ serverVersion: body.serverVersion,
1074
+ lifelineVersion: this.lifelineVersion,
1075
+ });
1076
+ }
1077
+ /** Watchdog-tracked counters/state. */
1078
+ consecutiveForwardFailures = 0;
1079
+ lastForwardSuccessAt = 0;
1080
+ conflict409StartedAt = null;
1081
+ orchestrator = null;
1082
+ watchdog = null;
780
1083
  // ── Lifeline Commands ─────────────────────────────────────
781
1084
  async handleLifelineCommand(text, topicId, fromUserId) {
782
1085
  const cmd = text.trim().toLowerCase();
@@ -884,6 +1187,26 @@ export class TelegramLifeline {
884
1187
  const failures = msg.replayFailures ?? 0;
885
1188
  if (failures >= TelegramLifeline.MAX_REPLAY_FAILURES) {
886
1189
  dropped++;
1190
+ // Before the drop becomes silent: persist the record, report a
1191
+ // degradation, and tell the original sender their message was lost.
1192
+ try {
1193
+ await notifyMessageDropped({
1194
+ stateDir: this.projectConfig.stateDir,
1195
+ topicId: msg.topicId,
1196
+ messageId: msg.id,
1197
+ senderName: msg.fromFirstName ?? msg.fromUsername ?? String(msg.fromUserId),
1198
+ text: msg.text,
1199
+ retryCount: failures,
1200
+ reason: `Handoff to server failed after ${failures} replay attempts`,
1201
+ sendToTopic: (topicId, body) => this.sendToTopic(topicId, body),
1202
+ });
1203
+ }
1204
+ catch (err) {
1205
+ // notifyMessageDropped only throws on true disk failure after the notice/report paths
1206
+ // had their chance — surface and continue; we still want to drop this message so
1207
+ // the queue doesn't stall.
1208
+ console.error(`[Lifeline] notifyMessageDropped threw for ${msg.id}:`, err instanceof Error ? err.message : err);
1209
+ }
887
1210
  console.warn(`[Lifeline] Dropping message ${msg.id} after ${failures} replay failures: ${msg.text.slice(0, 80)}`);
888
1211
  continue;
889
1212
  }