instar 0.28.66 → 0.28.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/dist/cli.js +53 -0
  2. package/dist/cli.js.map +1 -1
  3. package/dist/commands/server.d.ts.map +1 -1
  4. package/dist/commands/server.js +4 -1
  5. package/dist/commands/server.js.map +1 -1
  6. package/dist/lifeline/LifelineHealthWatchdog.d.ts +81 -0
  7. package/dist/lifeline/LifelineHealthWatchdog.d.ts.map +1 -0
  8. package/dist/lifeline/LifelineHealthWatchdog.js +122 -0
  9. package/dist/lifeline/LifelineHealthWatchdog.js.map +1 -0
  10. package/dist/lifeline/RestartOrchestrator.d.ts +73 -0
  11. package/dist/lifeline/RestartOrchestrator.d.ts.map +1 -0
  12. package/dist/lifeline/RestartOrchestrator.js +124 -0
  13. package/dist/lifeline/RestartOrchestrator.js.map +1 -0
  14. package/dist/lifeline/TelegramLifeline.d.ts +46 -0
  15. package/dist/lifeline/TelegramLifeline.d.ts.map +1 -1
  16. package/dist/lifeline/TelegramLifeline.js +319 -42
  17. package/dist/lifeline/TelegramLifeline.js.map +1 -1
  18. package/dist/lifeline/forwardErrors.d.ts +38 -0
  19. package/dist/lifeline/forwardErrors.d.ts.map +1 -0
  20. package/dist/lifeline/forwardErrors.js +53 -0
  21. package/dist/lifeline/forwardErrors.js.map +1 -0
  22. package/dist/lifeline/rateLimitState.d.ts +63 -0
  23. package/dist/lifeline/rateLimitState.d.ts.map +1 -0
  24. package/dist/lifeline/rateLimitState.js +110 -0
  25. package/dist/lifeline/rateLimitState.js.map +1 -0
  26. package/dist/lifeline/retryWithBackoff.d.ts +6 -0
  27. package/dist/lifeline/retryWithBackoff.d.ts.map +1 -1
  28. package/dist/lifeline/retryWithBackoff.js +2 -0
  29. package/dist/lifeline/retryWithBackoff.js.map +1 -1
  30. package/dist/lifeline/startupMarker.d.ts +20 -0
  31. package/dist/lifeline/startupMarker.d.ts.map +1 -0
  32. package/dist/lifeline/startupMarker.js +52 -0
  33. package/dist/lifeline/startupMarker.js.map +1 -0
  34. package/dist/lifeline/versionHandshake.d.ts +40 -0
  35. package/dist/lifeline/versionHandshake.d.ts.map +1 -0
  36. package/dist/lifeline/versionHandshake.js +45 -0
  37. package/dist/lifeline/versionHandshake.js.map +1 -0
  38. package/dist/monitoring/probes/LifelineProbe.d.ts +7 -2
  39. package/dist/monitoring/probes/LifelineProbe.d.ts.map +1 -1
  40. package/dist/monitoring/probes/LifelineProbe.js +91 -89
  41. package/dist/monitoring/probes/LifelineProbe.js.map +1 -1
  42. package/dist/server/routes.d.ts.map +1 -1
  43. package/dist/server/routes.js +58 -1
  44. package/dist/server/routes.js.map +1 -1
  45. package/package.json +1 -1
  46. package/scripts/fix-better-sqlite3.cjs +209 -43
  47. package/src/data/builtin-manifest.json +80 -80
  48. package/upgrades/0.28.67.md +58 -0
  49. package/upgrades/0.28.68.md +60 -0
  50. package/upgrades/NEXT.md +53 -0
  51. package/upgrades/side-effects/0.28.65.md +59 -0
  52. package/upgrades/side-effects/lifeline-self-restart-stage-b.md +129 -0
  53. package/upgrades/side-effects/lifeline-stage-c-chaos-tests.md +71 -0
  54. package/upgrades/side-effects/lifeline-supervisor-probe-optional.md +76 -0
  55. package/upgrades/side-effects/native-module-source-build-fallback.md +86 -0
@@ -24,7 +24,7 @@ import fs from 'node:fs';
24
24
  import os from 'node:os';
25
25
  import path from 'node:path';
26
26
  import pc from 'picocolors';
27
- import { loadConfig, ensureStateDir, detectTmuxPath } from '../core/Config.js';
27
+ import { loadConfig, ensureStateDir, detectTmuxPath, getInstarVersion } from '../core/Config.js';
28
28
  import { registerAgent, unregisterAgent, startHeartbeat } from '../core/AgentRegistry.js';
29
29
  // setup.ts uses @inquirer/prompts which requires Node 20.12+
30
30
  // Dynamic import to avoid breaking the lifeline on older Node versions
@@ -33,6 +33,12 @@ import { MessageQueue } from './MessageQueue.js';
33
33
  import { ServerSupervisor } from './ServerSupervisor.js';
34
34
  import { retryWithBackoff } from './retryWithBackoff.js';
35
35
  import { notifyMessageDropped } from './droppedMessages.js';
36
+ import { ForwardTransientError, ForwardBadRequestError, ForwardServerBootError, ForwardVersionSkewError, isTerminalForwardError, } from './forwardErrors.js';
37
+ import { writeStartupMarker } from './startupMarker.js';
38
+ import { RestartOrchestrator } from './RestartOrchestrator.js';
39
+ import { LifelineHealthWatchdog, DEFAULT_WATCHDOG_THRESHOLDS, } from './LifelineHealthWatchdog.js';
40
+ import { readRateLimitState, decide as decideRateLimit, writeRateLimitState, isRestartStorm, } from './rateLimitState.js';
41
+ import { DegradationReporter } from '../monitoring/DegradationReporter.js';
36
42
  /**
37
43
  * Acquire an exclusive lock file to prevent multiple lifeline instances.
38
44
  * Returns true if lock acquired, false if another instance holds it.
@@ -234,7 +240,21 @@ export class TelegramLifeline {
234
240
  console.log(pc.bold(`Starting Telegram Lifeline for ${pc.cyan(this.projectConfig.projectName)}`));
235
241
  console.log(` Port: ${this.projectConfig.port}`);
236
242
  console.log(` State: ${this.projectConfig.stateDir}`);
243
+ console.log(` Version: ${this.lifelineVersion}`);
237
244
  console.log();
245
+ // Stage B: startup liveness marker. Every startup, regardless of cause,
246
+ // writes this file so `instar lifeline restart` can detect pid changes.
247
+ writeStartupMarker(this.projectConfig.stateDir, this.lifelineVersion);
248
+ // Stage B: startup coherence check. Guards against respawning into a
249
+ // half-written shadow install where the bundled package.json advertises
250
+ // a version but the code is broken or missing. The getInstarVersion()
251
+ // helper is the same one used below; if it returns '0.0.0' (its error
252
+ // fallback), the install is incoherent — exit code 2 so launchd throttles
253
+ // respawn rather than tight-looping.
254
+ if (this.lifelineVersion === '0.0.0') {
255
+ console.error(pc.red('[Lifeline] startup coherence check failed: package.json missing or unreadable. Exiting with code 2 for launchd throttle.'));
256
+ process.exit(2);
257
+ }
238
258
  // Acquire exclusive lock — prevent multiple lifeline instances
239
259
  if (!acquireLockFile(this.lockPath)) {
240
260
  console.error(pc.red('[Lifeline] Another lifeline instance is already running. Exiting.'));
@@ -281,6 +301,10 @@ export class TelegramLifeline {
281
301
  this.replayQueue();
282
302
  }
283
303
  }, 15_000);
304
+ // Stage B: install the restart orchestrator and health watchdog.
305
+ // In unsupervised mode (no INSTAR_SUPERVISED=1 and no launchd parent),
306
+ // the orchestrator emits signals and logs but skips process.exit.
307
+ this.installOrchestratorAndWatchdog();
284
308
  // Replay any messages queued from previous lifeline runs
285
309
  if (this.queue.length > 0) {
286
310
  console.log(` ${this.queue.length} queued messages from previous run`);
@@ -314,37 +338,209 @@ export class TelegramLifeline {
314
338
  // without any visible error in the server logs — the agent appears alive
315
339
  // but never responds to messages.
316
340
  this.selfHealSettingsJson();
317
- // Graceful shutdown — every step is wrapped in try-catch because a crash
318
- // during shutdown leaves the lifeline in a half-alive state that confuses
319
- // launchd's KeepAlive restart logic.
320
- const shutdown = async () => {
321
- console.log('\nLifeline shutting down...');
322
- this.polling = false;
323
- if (this.pollTimeout)
324
- clearTimeout(this.pollTimeout);
325
- if (this.replayInterval)
326
- clearInterval(this.replayInterval);
327
- try {
328
- if (this.stopHeartbeat)
329
- this.stopHeartbeat();
330
- }
331
- catch { /* non-critical */ }
332
- try {
333
- unregisterAgent(this.projectConfig.projectDir + '-lifeline');
334
- }
335
- catch { /* ELOCKED is non-critical during shutdown */ }
336
- try {
337
- releaseLockFile(this.lockPath);
341
+ // Graceful shutdown — SIGTERM/SIGINT route through the orchestrator so
342
+ // external restarts (e.g., `instar lifeline restart` launchctl kickstart)
343
+ // get the same quiesce+persist semantics as self-triggered ones.
344
+ const externalShutdown = async () => {
345
+ if (this.orchestrator) {
346
+ await this.orchestrator.requestRestart({
347
+ reason: 'external-signal',
348
+ bucket: 'watchdog',
349
+ });
338
350
  }
339
- catch { /* non-critical */ }
340
- try {
341
- await this.supervisor.stop();
351
+ else {
352
+ // Fallback if orchestrator wasn't installed (should not happen post-Stage-B)
353
+ console.log('\nLifeline shutting down (no orchestrator)...');
354
+ await this.quiesceEverything();
355
+ process.exit(0);
342
356
  }
343
- catch { /* best effort */ }
344
- process.exit(0);
345
357
  };
346
- process.on('SIGINT', shutdown);
347
- process.on('SIGTERM', shutdown);
358
+ process.on('SIGINT', externalShutdown);
359
+ process.on('SIGTERM', externalShutdown);
360
+ }
361
+ /**
362
+ * Stop all in-flight / scheduled mutation sources so the queue snapshot
363
+ * is consistent when persisted.
364
+ */
365
+ async quiesceEverything() {
366
+ this.polling = false;
367
+ if (this.pollTimeout)
368
+ clearTimeout(this.pollTimeout);
369
+ if (this.replayInterval) {
370
+ clearInterval(this.replayInterval);
371
+ this.replayInterval = null;
372
+ }
373
+ if (this.watchdog)
374
+ this.watchdog.stop();
375
+ try {
376
+ if (this.stopHeartbeat)
377
+ this.stopHeartbeat();
378
+ }
379
+ catch { /* non-critical */ }
380
+ try {
381
+ unregisterAgent(this.projectConfig.projectDir + '-lifeline');
382
+ }
383
+ catch { /* non-critical */ }
384
+ try {
385
+ releaseLockFile(this.lockPath);
386
+ }
387
+ catch { /* non-critical */ }
388
+ try {
389
+ await this.supervisor.stop();
390
+ }
391
+ catch { /* best-effort */ }
392
+ }
393
+ /**
394
+ * Install the restart orchestrator and watchdog. Called from start().
395
+ *
396
+ * The orchestrator owns the process.exit call. The watchdog requests
397
+ * restarts via the orchestrator on threshold crossings, subject to
398
+ * rate-limit state on disk.
399
+ */
400
+ installOrchestratorAndWatchdog() {
401
+ const isSupervised = process.env.INSTAR_SUPERVISED === '1' ||
402
+ process.env.NODE_ENV !== 'test' && process.ppid === 1;
403
+ this.orchestrator = new RestartOrchestrator({
404
+ quiesce: () => this.quiesceEverything(),
405
+ persistAll: async () => {
406
+ // Each persist is best-effort; Promise.all so they run in parallel.
407
+ await Promise.all([
408
+ this.persistRateLimitSafe(),
409
+ // Queue + dropped-messages are already atomically persisted by
410
+ // existing code paths (MessageQueue.save, notifyMessageDropped's
411
+ // atomic write). A no-op here is correct — the goal is "nothing
412
+ // is in-flight that would need a final flush."
413
+ Promise.resolve(),
414
+ ]);
415
+ },
416
+ exitFn: (code) => process.exit(code),
417
+ isSupervised,
418
+ isShadowInstallUpdating: () => {
419
+ // Shadow-install sibling path: `.instar/shadow-install/.updating`.
420
+ // stateDir is `.instar/state`; we check one level up for the lockfile.
421
+ const lockPath = path.join(path.dirname(this.projectConfig.stateDir), 'shadow-install', '.updating');
422
+ try {
423
+ return fs.existsSync(lockPath);
424
+ }
425
+ catch {
426
+ return false;
427
+ }
428
+ },
429
+ });
430
+ const onTrip = (result) => {
431
+ this.initiateRestart('watchdog', result.primary ?? 'unknown', {
432
+ tripped: result.tripped,
433
+ snapshot: result.snapshot,
434
+ });
435
+ };
436
+ this.watchdog = new LifelineHealthWatchdog({
437
+ thresholds: this.loadThresholdOverrides(),
438
+ getInputs: () => ({
439
+ now: Date.now(),
440
+ oldestQueueItemEnqueuedAt: this.oldestQueueItemEnqueuedAt(),
441
+ consecutiveForwardFailures: this.consecutiveForwardFailures,
442
+ conflict409StartedAt: this.conflict409StartedAt,
443
+ serverHealthy: this.supervisor.getStatus().healthy,
444
+ }),
445
+ onTrip,
446
+ onStarved: (gap) => {
447
+ DegradationReporter.getInstance().report({
448
+ feature: 'TelegramLifeline.watchdogStarved',
449
+ primary: 'Watchdog tick on schedule',
450
+ fallback: `Tick gap ${Math.round(gap / 1000)}s — event loop blocked`,
451
+ reason: 'setInterval delayed by blocked loop',
452
+ impact: 'Observability only; watchdog still functional at coarser granularity.',
453
+ });
454
+ },
455
+ autoStart: process.env.NODE_ENV !== 'test',
456
+ });
457
+ }
458
+ /** Extract oldest queue item's enqueue timestamp as ms, if any. */
459
+ oldestQueueItemEnqueuedAt() {
460
+ const peeked = this.queue.peek();
461
+ if (peeked.length === 0)
462
+ return undefined;
463
+ const ts = Date.parse(peeked[0].timestamp);
464
+ return Number.isFinite(ts) ? ts : undefined;
465
+ }
466
+ /** Read config overrides for watchdog thresholds. */
467
+ loadThresholdOverrides() {
468
+ const raw = this.projectConfig.lifeline?.watchdog;
469
+ if (!raw || typeof raw !== 'object')
470
+ return {};
471
+ const valid = (v) => typeof v === 'number' && Number.isFinite(v) && v > 0;
472
+ const out = {};
473
+ if (valid(raw.tickIntervalMs))
474
+ out.tickIntervalMs = raw.tickIntervalMs;
475
+ if (valid(raw.noForwardStuckMs))
476
+ out.noForwardStuckMs = raw.noForwardStuckMs;
477
+ if (valid(raw.consecutiveFailureMax))
478
+ out.consecutiveFailureMax = raw.consecutiveFailureMax;
479
+ if (valid(raw.conflict409StuckMs))
480
+ out.conflict409StuckMs = raw.conflict409StuckMs;
481
+ let hadInvalid = false;
482
+ for (const k of Object.keys(raw)) {
483
+ if (!(k in DEFAULT_WATCHDOG_THRESHOLDS))
484
+ hadInvalid = true;
485
+ else if (!valid(raw[k]))
486
+ hadInvalid = true;
487
+ }
488
+ if (hadInvalid) {
489
+ DegradationReporter.getInstance().report({
490
+ feature: 'TelegramLifeline.configInvalid',
491
+ primary: 'Valid watchdog threshold overrides',
492
+ fallback: 'Falling back to defaults for invalid keys',
493
+ reason: 'Non-finite, non-positive, or unknown override key in lifeline.watchdog',
494
+ impact: 'Threshold uses default; behavior unchanged but config is misleading.',
495
+ });
496
+ }
497
+ return out;
498
+ }
499
+ /** Persist rate-limit state. Safe to call during orchestrator persist. */
500
+ async persistRateLimitSafe() {
501
+ // The orchestrator invokes this while transitioning to 'persisting';
502
+ // rate-limit history was already written by initiateRestart() before
503
+ // the orchestrator was called. This is a final no-op flush.
504
+ return;
505
+ }
506
+ /**
507
+ * Unified restart initiator: checks rate limit, writes history, then
508
+ * calls the orchestrator. Used by both the watchdog tick (bucket=watchdog)
509
+ * and the version-skew handler (bucket=versionSkew).
510
+ */
511
+ initiateRestart(bucket, reason, context) {
512
+ const outcome = readRateLimitState(this.projectConfig.stateDir);
513
+ const dec = decideRateLimit(outcome, bucket);
514
+ if (!dec.allowed) {
515
+ console.log(`[Lifeline] restart suppressed by rate limit: ${dec.reason} (bucket=${bucket} reason=${reason})`);
516
+ return;
517
+ }
518
+ // Storm escalation signal (fires in addition to the normal restart
519
+ // signal so the operator sees that self-heal is not converging).
520
+ if (dec.stormActive || isRestartStorm(outcome.kind === 'ok' ? outcome.state : null)) {
521
+ DegradationReporter.getInstance().report({
522
+ feature: 'TelegramLifeline.restartStorm',
523
+ primary: 'Rate-limited self-restarts within ceiling',
524
+ fallback: 'Continuing to restart — underlying cause unresolved',
525
+ reason: `>= 6 restarts within the last hour; latest bucket=${bucket} reason=${reason}`,
526
+ impact: 'Operator should investigate; self-heal is not converging.',
527
+ });
528
+ }
529
+ // Write the history entry BEFORE calling process.exit so the new lifeline
530
+ // sees the rate-limit state on startup. Best-effort — failure here still
531
+ // lets the restart proceed (orchestrator is authoritative).
532
+ try {
533
+ const prior = outcome.kind === 'ok' ? outcome.state : null;
534
+ writeRateLimitState(this.projectConfig.stateDir, reason, bucket, prior);
535
+ }
536
+ catch (err) {
537
+ console.error(`[Lifeline] failed to write rate-limit state: ${err}`);
538
+ }
539
+ if (!this.orchestrator) {
540
+ console.error('[Lifeline] initiateRestart called before orchestrator was installed');
541
+ return;
542
+ }
543
+ void this.orchestrator.requestRestart({ reason, bucket, context });
348
544
  }
349
545
  // ── Stale Connection Flush ───────────────────────────────
350
546
  /**
@@ -417,6 +613,8 @@ export class TelegramLifeline {
417
613
  this.saveOffset();
418
614
  }
419
615
  // Success — reset backoff counters
616
+ if (this.consecutive409s > 0)
617
+ this.conflict409StartedAt = null; // 0→... edge
420
618
  this.consecutive409s = 0;
421
619
  this.consecutive429s = 0;
422
620
  this.pollBackoffMs = this.config.pollIntervalMs ?? 2000;
@@ -430,6 +628,9 @@ export class TelegramLifeline {
430
628
  }
431
629
  // Handle 409 Conflict (multiple bot instances polling)
432
630
  if (errMsg.includes('409') && errMsg.includes('Conflict')) {
631
+ // 0→>0 edge: record when conflict started so watchdog can time the stuck state.
632
+ if (this.consecutive409s === 0)
633
+ this.conflict409StartedAt = Date.now();
433
634
  this.consecutive409s++;
434
635
  // Exponential backoff: 4s, 8s, 16s, 32s, max 60s
435
636
  this.pollBackoffMs = Math.min(60_000, 2000 * Math.pow(2, this.consecutive409s));
@@ -754,7 +955,25 @@ export class TelegramLifeline {
754
955
  */
755
956
  static FORWARD_ATTEMPTS = 3;
756
957
  static FORWARD_BACKOFF_BASE_MS = 1000;
958
+ /**
959
+ * `legacyStrict` — if a pre-Stage-B server strictly validates JSON and
960
+ * rejects the unknown `lifelineVersion` field with 400, the lifeline
961
+ * falls back to omitting it and pins this flag for the session.
962
+ */
963
+ legacyStrictServer = false;
964
+ /** Full semver of this lifeline, read once at construction. */
965
+ lifelineVersion = getInstarVersion();
757
966
  async forwardToServer(topicId, text, rawMsg) {
967
+ const buildBody = (includeVersion) => JSON.stringify({
968
+ topicId,
969
+ text,
970
+ fromUserId: rawMsg.from.id,
971
+ fromUsername: rawMsg.from.username,
972
+ fromFirstName: rawMsg.from.first_name,
973
+ messageId: rawMsg.message_id,
974
+ timestamp: new Date(rawMsg.date * 1000).toISOString(),
975
+ ...(includeVersion ? { lifelineVersion: this.lifelineVersion } : {}),
976
+ });
758
977
  const doForward = async () => {
759
978
  const controller = new AbortController();
760
979
  const timer = setTimeout(() => controller.abort(), 10_000);
@@ -766,21 +985,39 @@ export class TelegramLifeline {
766
985
  const response = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, {
767
986
  method: 'POST',
768
987
  headers: fwdHeaders,
769
- body: JSON.stringify({
770
- topicId,
771
- text,
772
- fromUserId: rawMsg.from.id,
773
- fromUsername: rawMsg.from.username,
774
- fromFirstName: rawMsg.from.first_name,
775
- messageId: rawMsg.message_id,
776
- timestamp: new Date(rawMsg.date * 1000).toISOString(),
777
- }),
988
+ body: buildBody(!this.legacyStrictServer),
778
989
  signal: controller.signal,
779
990
  });
780
- if (!response.ok) {
781
- throw new Error(`forward responded ${response.status}`);
991
+ if (response.ok)
992
+ return true;
993
+ if (response.status === 426) {
994
+ const body = (await response.json().catch(() => ({})));
995
+ throw new ForwardVersionSkewError(426, body);
782
996
  }
783
- return true;
997
+ if (response.status === 503) {
998
+ const body = (await response.json().catch(() => ({})));
999
+ throw new ForwardServerBootError(body.retryAfterMs ?? 1000);
1000
+ }
1001
+ if (response.status === 400) {
1002
+ const body = await response.json().catch(() => ({}));
1003
+ // Graceful degradation: if we included lifelineVersion and the
1004
+ // server rejected the request, retry once without it.
1005
+ if (!this.legacyStrictServer) {
1006
+ this.legacyStrictServer = true;
1007
+ console.warn(`[Lifeline] server returned 400 with lifelineVersion; ` +
1008
+ `retrying without (legacyStrictServer=true)`);
1009
+ // Re-issue the request WITHOUT the version field and return the
1010
+ // result of that retry. If still 400, it's a genuine bad request.
1011
+ const r2 = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, { method: 'POST', headers: fwdHeaders, body: buildBody(false) });
1012
+ if (r2.ok)
1013
+ return true;
1014
+ if (r2.status === 400)
1015
+ throw new ForwardBadRequestError(await r2.json().catch(() => ({})));
1016
+ throw new ForwardTransientError(r2.status);
1017
+ }
1018
+ throw new ForwardBadRequestError(body);
1019
+ }
1020
+ throw new ForwardTransientError(response.status);
784
1021
  }
785
1022
  finally {
786
1023
  clearTimeout(timer);
@@ -790,6 +1027,7 @@ export class TelegramLifeline {
790
1027
  await retryWithBackoff(doForward, {
791
1028
  attempts: TelegramLifeline.FORWARD_ATTEMPTS,
792
1029
  baseMs: TelegramLifeline.FORWARD_BACKOFF_BASE_MS,
1030
+ isTerminal: isTerminalForwardError,
793
1031
  onAttempt: (n, lastErr) => {
794
1032
  if (n > 1) {
795
1033
  console.warn(`[Lifeline] forwardToServer retry ${n}/${TelegramLifeline.FORWARD_ATTEMPTS} ` +
@@ -797,12 +1035,51 @@ export class TelegramLifeline {
797
1035
  }
798
1036
  },
799
1037
  });
1038
+ // Record success for watchdog.
1039
+ this.consecutiveForwardFailures = 0;
1040
+ this.lastForwardSuccessAt = Date.now();
800
1041
  return true;
801
1042
  }
802
- catch {
1043
+ catch (err) {
1044
+ // Version-skew handler: emit signal + request restart via orchestrator.
1045
+ if (err instanceof ForwardVersionSkewError) {
1046
+ this.handleVersionSkew(err);
1047
+ return false;
1048
+ }
1049
+ this.consecutiveForwardFailures++;
803
1050
  return false;
804
1051
  }
805
1052
  }
1053
+ /**
1054
+ * Handle a 426 response from the server. Validates the response body's
1055
+ * `serverVersion` differs from this lifeline's, then requests restart
1056
+ * through the orchestrator. If the body is malformed or the versions
1057
+ * match (loopback impostor), treat as transient.
1058
+ */
1059
+ handleVersionSkew(err) {
1060
+ const { body } = err;
1061
+ if (body.upgradeRequired !== true) {
1062
+ // Not a genuine Stage-B upgrade directive; treat as transient noise.
1063
+ this.consecutiveForwardFailures++;
1064
+ return;
1065
+ }
1066
+ if (typeof body.serverVersion !== 'string' || body.serverVersion === this.lifelineVersion) {
1067
+ // Loopback impostor or malformed body — don't trust it.
1068
+ console.warn(`[Lifeline] ignoring 426 with missing/matching serverVersion`);
1069
+ this.consecutiveForwardFailures++;
1070
+ return;
1071
+ }
1072
+ this.initiateRestart('versionSkew', 'version-skew', {
1073
+ serverVersion: body.serverVersion,
1074
+ lifelineVersion: this.lifelineVersion,
1075
+ });
1076
+ }
1077
+ /** Watchdog-tracked counters/state. */
1078
+ consecutiveForwardFailures = 0;
1079
+ lastForwardSuccessAt = 0;
1080
+ conflict409StartedAt = null;
1081
+ orchestrator = null;
1082
+ watchdog = null;
806
1083
  // ── Lifeline Commands ─────────────────────────────────────
807
1084
  async handleLifelineCommand(text, topicId, fromUserId) {
808
1085
  const cmd = text.trim().toLowerCase();