instar 0.28.66 → 0.28.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +53 -0
- package/dist/cli.js.map +1 -1
- package/dist/commands/server.d.ts.map +1 -1
- package/dist/commands/server.js +4 -1
- package/dist/commands/server.js.map +1 -1
- package/dist/lifeline/LifelineHealthWatchdog.d.ts +81 -0
- package/dist/lifeline/LifelineHealthWatchdog.d.ts.map +1 -0
- package/dist/lifeline/LifelineHealthWatchdog.js +122 -0
- package/dist/lifeline/LifelineHealthWatchdog.js.map +1 -0
- package/dist/lifeline/RestartOrchestrator.d.ts +73 -0
- package/dist/lifeline/RestartOrchestrator.d.ts.map +1 -0
- package/dist/lifeline/RestartOrchestrator.js +124 -0
- package/dist/lifeline/RestartOrchestrator.js.map +1 -0
- package/dist/lifeline/TelegramLifeline.d.ts +46 -0
- package/dist/lifeline/TelegramLifeline.d.ts.map +1 -1
- package/dist/lifeline/TelegramLifeline.js +319 -42
- package/dist/lifeline/TelegramLifeline.js.map +1 -1
- package/dist/lifeline/forwardErrors.d.ts +38 -0
- package/dist/lifeline/forwardErrors.d.ts.map +1 -0
- package/dist/lifeline/forwardErrors.js +53 -0
- package/dist/lifeline/forwardErrors.js.map +1 -0
- package/dist/lifeline/rateLimitState.d.ts +63 -0
- package/dist/lifeline/rateLimitState.d.ts.map +1 -0
- package/dist/lifeline/rateLimitState.js +110 -0
- package/dist/lifeline/rateLimitState.js.map +1 -0
- package/dist/lifeline/retryWithBackoff.d.ts +6 -0
- package/dist/lifeline/retryWithBackoff.d.ts.map +1 -1
- package/dist/lifeline/retryWithBackoff.js +2 -0
- package/dist/lifeline/retryWithBackoff.js.map +1 -1
- package/dist/lifeline/startupMarker.d.ts +20 -0
- package/dist/lifeline/startupMarker.d.ts.map +1 -0
- package/dist/lifeline/startupMarker.js +52 -0
- package/dist/lifeline/startupMarker.js.map +1 -0
- package/dist/lifeline/versionHandshake.d.ts +40 -0
- package/dist/lifeline/versionHandshake.d.ts.map +1 -0
- package/dist/lifeline/versionHandshake.js +45 -0
- package/dist/lifeline/versionHandshake.js.map +1 -0
- package/dist/monitoring/probes/LifelineProbe.d.ts +7 -2
- package/dist/monitoring/probes/LifelineProbe.d.ts.map +1 -1
- package/dist/monitoring/probes/LifelineProbe.js +91 -89
- package/dist/monitoring/probes/LifelineProbe.js.map +1 -1
- package/dist/server/routes.d.ts.map +1 -1
- package/dist/server/routes.js +58 -1
- package/dist/server/routes.js.map +1 -1
- package/package.json +1 -1
- package/scripts/fix-better-sqlite3.cjs +209 -43
- package/src/data/builtin-manifest.json +80 -80
- package/upgrades/0.28.67.md +58 -0
- package/upgrades/0.28.68.md +60 -0
- package/upgrades/NEXT.md +53 -0
- package/upgrades/side-effects/0.28.65.md +59 -0
- package/upgrades/side-effects/lifeline-self-restart-stage-b.md +129 -0
- package/upgrades/side-effects/lifeline-stage-c-chaos-tests.md +71 -0
- package/upgrades/side-effects/lifeline-supervisor-probe-optional.md +76 -0
- package/upgrades/side-effects/native-module-source-build-fallback.md +86 -0
|
@@ -24,7 +24,7 @@ import fs from 'node:fs';
|
|
|
24
24
|
import os from 'node:os';
|
|
25
25
|
import path from 'node:path';
|
|
26
26
|
import pc from 'picocolors';
|
|
27
|
-
import { loadConfig, ensureStateDir, detectTmuxPath } from '../core/Config.js';
|
|
27
|
+
import { loadConfig, ensureStateDir, detectTmuxPath, getInstarVersion } from '../core/Config.js';
|
|
28
28
|
import { registerAgent, unregisterAgent, startHeartbeat } from '../core/AgentRegistry.js';
|
|
29
29
|
// setup.ts uses @inquirer/prompts which requires Node 20.12+
|
|
30
30
|
// Dynamic import to avoid breaking the lifeline on older Node versions
|
|
@@ -33,6 +33,12 @@ import { MessageQueue } from './MessageQueue.js';
|
|
|
33
33
|
import { ServerSupervisor } from './ServerSupervisor.js';
|
|
34
34
|
import { retryWithBackoff } from './retryWithBackoff.js';
|
|
35
35
|
import { notifyMessageDropped } from './droppedMessages.js';
|
|
36
|
+
import { ForwardTransientError, ForwardBadRequestError, ForwardServerBootError, ForwardVersionSkewError, isTerminalForwardError, } from './forwardErrors.js';
|
|
37
|
+
import { writeStartupMarker } from './startupMarker.js';
|
|
38
|
+
import { RestartOrchestrator } from './RestartOrchestrator.js';
|
|
39
|
+
import { LifelineHealthWatchdog, DEFAULT_WATCHDOG_THRESHOLDS, } from './LifelineHealthWatchdog.js';
|
|
40
|
+
import { readRateLimitState, decide as decideRateLimit, writeRateLimitState, isRestartStorm, } from './rateLimitState.js';
|
|
41
|
+
import { DegradationReporter } from '../monitoring/DegradationReporter.js';
|
|
36
42
|
/**
|
|
37
43
|
* Acquire an exclusive lock file to prevent multiple lifeline instances.
|
|
38
44
|
* Returns true if lock acquired, false if another instance holds it.
|
|
@@ -234,7 +240,21 @@ export class TelegramLifeline {
|
|
|
234
240
|
console.log(pc.bold(`Starting Telegram Lifeline for ${pc.cyan(this.projectConfig.projectName)}`));
|
|
235
241
|
console.log(` Port: ${this.projectConfig.port}`);
|
|
236
242
|
console.log(` State: ${this.projectConfig.stateDir}`);
|
|
243
|
+
console.log(` Version: ${this.lifelineVersion}`);
|
|
237
244
|
console.log();
|
|
245
|
+
// Stage B: startup liveness marker. Every startup, regardless of cause,
|
|
246
|
+
// writes this file so `instar lifeline restart` can detect pid changes.
|
|
247
|
+
writeStartupMarker(this.projectConfig.stateDir, this.lifelineVersion);
|
|
248
|
+
// Stage B: startup coherence check. Guards against respawning into a
|
|
249
|
+
// half-written shadow install where the bundled package.json advertises
|
|
250
|
+
// a version but the code is broken or missing. The getInstarVersion()
|
|
251
|
+
// helper is the same one used below; if it returns '0.0.0' (its error
|
|
252
|
+
// fallback), the install is incoherent — exit code 2 so launchd throttles
|
|
253
|
+
// respawn rather than tight-looping.
|
|
254
|
+
if (this.lifelineVersion === '0.0.0') {
|
|
255
|
+
console.error(pc.red('[Lifeline] startup coherence check failed: package.json missing or unreadable. Exiting with code 2 for launchd throttle.'));
|
|
256
|
+
process.exit(2);
|
|
257
|
+
}
|
|
238
258
|
// Acquire exclusive lock — prevent multiple lifeline instances
|
|
239
259
|
if (!acquireLockFile(this.lockPath)) {
|
|
240
260
|
console.error(pc.red('[Lifeline] Another lifeline instance is already running. Exiting.'));
|
|
@@ -281,6 +301,10 @@ export class TelegramLifeline {
|
|
|
281
301
|
this.replayQueue();
|
|
282
302
|
}
|
|
283
303
|
}, 15_000);
|
|
304
|
+
// Stage B: install the restart orchestrator and health watchdog.
|
|
305
|
+
// In unsupervised mode (no INSTAR_SUPERVISED=1 and no launchd parent),
|
|
306
|
+
// the orchestrator emits signals and logs but skips process.exit.
|
|
307
|
+
this.installOrchestratorAndWatchdog();
|
|
284
308
|
// Replay any messages queued from previous lifeline runs
|
|
285
309
|
if (this.queue.length > 0) {
|
|
286
310
|
console.log(` ${this.queue.length} queued messages from previous run`);
|
|
@@ -314,37 +338,209 @@ export class TelegramLifeline {
|
|
|
314
338
|
// without any visible error in the server logs — the agent appears alive
|
|
315
339
|
// but never responds to messages.
|
|
316
340
|
this.selfHealSettingsJson();
|
|
317
|
-
// Graceful shutdown —
|
|
318
|
-
//
|
|
319
|
-
//
|
|
320
|
-
const
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
clearInterval(this.replayInterval);
|
|
327
|
-
try {
|
|
328
|
-
if (this.stopHeartbeat)
|
|
329
|
-
this.stopHeartbeat();
|
|
330
|
-
}
|
|
331
|
-
catch { /* non-critical */ }
|
|
332
|
-
try {
|
|
333
|
-
unregisterAgent(this.projectConfig.projectDir + '-lifeline');
|
|
334
|
-
}
|
|
335
|
-
catch { /* ELOCKED is non-critical during shutdown */ }
|
|
336
|
-
try {
|
|
337
|
-
releaseLockFile(this.lockPath);
|
|
341
|
+
// Graceful shutdown — SIGTERM/SIGINT route through the orchestrator so
|
|
342
|
+
// external restarts (e.g., `instar lifeline restart` → launchctl kickstart)
|
|
343
|
+
// get the same quiesce+persist semantics as self-triggered ones.
|
|
344
|
+
const externalShutdown = async () => {
|
|
345
|
+
if (this.orchestrator) {
|
|
346
|
+
await this.orchestrator.requestRestart({
|
|
347
|
+
reason: 'external-signal',
|
|
348
|
+
bucket: 'watchdog',
|
|
349
|
+
});
|
|
338
350
|
}
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
351
|
+
else {
|
|
352
|
+
// Fallback if orchestrator wasn't installed (should not happen post-Stage-B)
|
|
353
|
+
console.log('\nLifeline shutting down (no orchestrator)...');
|
|
354
|
+
await this.quiesceEverything();
|
|
355
|
+
process.exit(0);
|
|
342
356
|
}
|
|
343
|
-
catch { /* best effort */ }
|
|
344
|
-
process.exit(0);
|
|
345
357
|
};
|
|
346
|
-
process.on('SIGINT',
|
|
347
|
-
process.on('SIGTERM',
|
|
358
|
+
process.on('SIGINT', externalShutdown);
|
|
359
|
+
process.on('SIGTERM', externalShutdown);
|
|
360
|
+
}
|
|
361
|
+
/**
|
|
362
|
+
* Stop all in-flight / scheduled mutation sources so the queue snapshot
|
|
363
|
+
* is consistent when persisted.
|
|
364
|
+
*/
|
|
365
|
+
async quiesceEverything() {
|
|
366
|
+
this.polling = false;
|
|
367
|
+
if (this.pollTimeout)
|
|
368
|
+
clearTimeout(this.pollTimeout);
|
|
369
|
+
if (this.replayInterval) {
|
|
370
|
+
clearInterval(this.replayInterval);
|
|
371
|
+
this.replayInterval = null;
|
|
372
|
+
}
|
|
373
|
+
if (this.watchdog)
|
|
374
|
+
this.watchdog.stop();
|
|
375
|
+
try {
|
|
376
|
+
if (this.stopHeartbeat)
|
|
377
|
+
this.stopHeartbeat();
|
|
378
|
+
}
|
|
379
|
+
catch { /* non-critical */ }
|
|
380
|
+
try {
|
|
381
|
+
unregisterAgent(this.projectConfig.projectDir + '-lifeline');
|
|
382
|
+
}
|
|
383
|
+
catch { /* non-critical */ }
|
|
384
|
+
try {
|
|
385
|
+
releaseLockFile(this.lockPath);
|
|
386
|
+
}
|
|
387
|
+
catch { /* non-critical */ }
|
|
388
|
+
try {
|
|
389
|
+
await this.supervisor.stop();
|
|
390
|
+
}
|
|
391
|
+
catch { /* best-effort */ }
|
|
392
|
+
}
|
|
393
|
+
/**
|
|
394
|
+
* Install the restart orchestrator and watchdog. Called from start().
|
|
395
|
+
*
|
|
396
|
+
* The orchestrator owns the process.exit call. The watchdog requests
|
|
397
|
+
* restarts via the orchestrator on threshold crossings, subject to
|
|
398
|
+
* rate-limit state on disk.
|
|
399
|
+
*/
|
|
400
|
+
installOrchestratorAndWatchdog() {
|
|
401
|
+
const isSupervised = process.env.INSTAR_SUPERVISED === '1' ||
|
|
402
|
+
process.env.NODE_ENV !== 'test' && process.ppid === 1;
|
|
403
|
+
this.orchestrator = new RestartOrchestrator({
|
|
404
|
+
quiesce: () => this.quiesceEverything(),
|
|
405
|
+
persistAll: async () => {
|
|
406
|
+
// Each persist is best-effort; Promise.all so they run in parallel.
|
|
407
|
+
await Promise.all([
|
|
408
|
+
this.persistRateLimitSafe(),
|
|
409
|
+
// Queue + dropped-messages are already atomically persisted by
|
|
410
|
+
// existing code paths (MessageQueue.save, notifyMessageDropped's
|
|
411
|
+
// atomic write). A no-op here is correct — the goal is "nothing
|
|
412
|
+
// is in-flight that would need a final flush."
|
|
413
|
+
Promise.resolve(),
|
|
414
|
+
]);
|
|
415
|
+
},
|
|
416
|
+
exitFn: (code) => process.exit(code),
|
|
417
|
+
isSupervised,
|
|
418
|
+
isShadowInstallUpdating: () => {
|
|
419
|
+
// Shadow-install sibling path: `.instar/shadow-install/.updating`.
|
|
420
|
+
// stateDir is `.instar/state`; we check one level up for the lockfile.
|
|
421
|
+
const lockPath = path.join(path.dirname(this.projectConfig.stateDir), 'shadow-install', '.updating');
|
|
422
|
+
try {
|
|
423
|
+
return fs.existsSync(lockPath);
|
|
424
|
+
}
|
|
425
|
+
catch {
|
|
426
|
+
return false;
|
|
427
|
+
}
|
|
428
|
+
},
|
|
429
|
+
});
|
|
430
|
+
const onTrip = (result) => {
|
|
431
|
+
this.initiateRestart('watchdog', result.primary ?? 'unknown', {
|
|
432
|
+
tripped: result.tripped,
|
|
433
|
+
snapshot: result.snapshot,
|
|
434
|
+
});
|
|
435
|
+
};
|
|
436
|
+
this.watchdog = new LifelineHealthWatchdog({
|
|
437
|
+
thresholds: this.loadThresholdOverrides(),
|
|
438
|
+
getInputs: () => ({
|
|
439
|
+
now: Date.now(),
|
|
440
|
+
oldestQueueItemEnqueuedAt: this.oldestQueueItemEnqueuedAt(),
|
|
441
|
+
consecutiveForwardFailures: this.consecutiveForwardFailures,
|
|
442
|
+
conflict409StartedAt: this.conflict409StartedAt,
|
|
443
|
+
serverHealthy: this.supervisor.getStatus().healthy,
|
|
444
|
+
}),
|
|
445
|
+
onTrip,
|
|
446
|
+
onStarved: (gap) => {
|
|
447
|
+
DegradationReporter.getInstance().report({
|
|
448
|
+
feature: 'TelegramLifeline.watchdogStarved',
|
|
449
|
+
primary: 'Watchdog tick on schedule',
|
|
450
|
+
fallback: `Tick gap ${Math.round(gap / 1000)}s — event loop blocked`,
|
|
451
|
+
reason: 'setInterval delayed by blocked loop',
|
|
452
|
+
impact: 'Observability only; watchdog still functional at coarser granularity.',
|
|
453
|
+
});
|
|
454
|
+
},
|
|
455
|
+
autoStart: process.env.NODE_ENV !== 'test',
|
|
456
|
+
});
|
|
457
|
+
}
|
|
458
|
+
/** Extract oldest queue item's enqueue timestamp as ms, if any. */
|
|
459
|
+
oldestQueueItemEnqueuedAt() {
|
|
460
|
+
const peeked = this.queue.peek();
|
|
461
|
+
if (peeked.length === 0)
|
|
462
|
+
return undefined;
|
|
463
|
+
const ts = Date.parse(peeked[0].timestamp);
|
|
464
|
+
return Number.isFinite(ts) ? ts : undefined;
|
|
465
|
+
}
|
|
466
|
+
/** Read config overrides for watchdog thresholds. */
|
|
467
|
+
loadThresholdOverrides() {
|
|
468
|
+
const raw = this.projectConfig.lifeline?.watchdog;
|
|
469
|
+
if (!raw || typeof raw !== 'object')
|
|
470
|
+
return {};
|
|
471
|
+
const valid = (v) => typeof v === 'number' && Number.isFinite(v) && v > 0;
|
|
472
|
+
const out = {};
|
|
473
|
+
if (valid(raw.tickIntervalMs))
|
|
474
|
+
out.tickIntervalMs = raw.tickIntervalMs;
|
|
475
|
+
if (valid(raw.noForwardStuckMs))
|
|
476
|
+
out.noForwardStuckMs = raw.noForwardStuckMs;
|
|
477
|
+
if (valid(raw.consecutiveFailureMax))
|
|
478
|
+
out.consecutiveFailureMax = raw.consecutiveFailureMax;
|
|
479
|
+
if (valid(raw.conflict409StuckMs))
|
|
480
|
+
out.conflict409StuckMs = raw.conflict409StuckMs;
|
|
481
|
+
let hadInvalid = false;
|
|
482
|
+
for (const k of Object.keys(raw)) {
|
|
483
|
+
if (!(k in DEFAULT_WATCHDOG_THRESHOLDS))
|
|
484
|
+
hadInvalid = true;
|
|
485
|
+
else if (!valid(raw[k]))
|
|
486
|
+
hadInvalid = true;
|
|
487
|
+
}
|
|
488
|
+
if (hadInvalid) {
|
|
489
|
+
DegradationReporter.getInstance().report({
|
|
490
|
+
feature: 'TelegramLifeline.configInvalid',
|
|
491
|
+
primary: 'Valid watchdog threshold overrides',
|
|
492
|
+
fallback: 'Falling back to defaults for invalid keys',
|
|
493
|
+
reason: 'Non-finite, non-positive, or unknown override key in lifeline.watchdog',
|
|
494
|
+
impact: 'Threshold uses default; behavior unchanged but config is misleading.',
|
|
495
|
+
});
|
|
496
|
+
}
|
|
497
|
+
return out;
|
|
498
|
+
}
|
|
499
|
+
/** Persist rate-limit state. Safe to call during orchestrator persist. */
|
|
500
|
+
async persistRateLimitSafe() {
|
|
501
|
+
// The orchestrator invokes this while transitioning to 'persisting';
|
|
502
|
+
// rate-limit history was already written by initiateRestart() before
|
|
503
|
+
// the orchestrator was called. This is a final no-op flush.
|
|
504
|
+
return;
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Unified restart initiator: checks rate limit, writes history, then
|
|
508
|
+
* calls the orchestrator. Used by both the watchdog tick (bucket=watchdog)
|
|
509
|
+
* and the version-skew handler (bucket=versionSkew).
|
|
510
|
+
*/
|
|
511
|
+
initiateRestart(bucket, reason, context) {
|
|
512
|
+
const outcome = readRateLimitState(this.projectConfig.stateDir);
|
|
513
|
+
const dec = decideRateLimit(outcome, bucket);
|
|
514
|
+
if (!dec.allowed) {
|
|
515
|
+
console.log(`[Lifeline] restart suppressed by rate limit: ${dec.reason} (bucket=${bucket} reason=${reason})`);
|
|
516
|
+
return;
|
|
517
|
+
}
|
|
518
|
+
// Storm escalation signal (fires in addition to the normal restart
|
|
519
|
+
// signal so the operator sees that self-heal is not converging).
|
|
520
|
+
if (dec.stormActive || isRestartStorm(outcome.kind === 'ok' ? outcome.state : null)) {
|
|
521
|
+
DegradationReporter.getInstance().report({
|
|
522
|
+
feature: 'TelegramLifeline.restartStorm',
|
|
523
|
+
primary: 'Rate-limited self-restarts within ceiling',
|
|
524
|
+
fallback: 'Continuing to restart — underlying cause unresolved',
|
|
525
|
+
reason: `>= 6 restarts within the last hour; latest bucket=${bucket} reason=${reason}`,
|
|
526
|
+
impact: 'Operator should investigate; self-heal is not converging.',
|
|
527
|
+
});
|
|
528
|
+
}
|
|
529
|
+
// Write the history entry BEFORE calling process.exit so the new lifeline
|
|
530
|
+
// sees the rate-limit state on startup. Best-effort — failure here still
|
|
531
|
+
// lets the restart proceed (orchestrator is authoritative).
|
|
532
|
+
try {
|
|
533
|
+
const prior = outcome.kind === 'ok' ? outcome.state : null;
|
|
534
|
+
writeRateLimitState(this.projectConfig.stateDir, reason, bucket, prior);
|
|
535
|
+
}
|
|
536
|
+
catch (err) {
|
|
537
|
+
console.error(`[Lifeline] failed to write rate-limit state: ${err}`);
|
|
538
|
+
}
|
|
539
|
+
if (!this.orchestrator) {
|
|
540
|
+
console.error('[Lifeline] initiateRestart called before orchestrator was installed');
|
|
541
|
+
return;
|
|
542
|
+
}
|
|
543
|
+
void this.orchestrator.requestRestart({ reason, bucket, context });
|
|
348
544
|
}
|
|
349
545
|
// ── Stale Connection Flush ───────────────────────────────
|
|
350
546
|
/**
|
|
@@ -417,6 +613,8 @@ export class TelegramLifeline {
|
|
|
417
613
|
this.saveOffset();
|
|
418
614
|
}
|
|
419
615
|
// Success — reset backoff counters
|
|
616
|
+
if (this.consecutive409s > 0)
|
|
617
|
+
this.conflict409StartedAt = null; // 0→... edge
|
|
420
618
|
this.consecutive409s = 0;
|
|
421
619
|
this.consecutive429s = 0;
|
|
422
620
|
this.pollBackoffMs = this.config.pollIntervalMs ?? 2000;
|
|
@@ -430,6 +628,9 @@ export class TelegramLifeline {
|
|
|
430
628
|
}
|
|
431
629
|
// Handle 409 Conflict (multiple bot instances polling)
|
|
432
630
|
if (errMsg.includes('409') && errMsg.includes('Conflict')) {
|
|
631
|
+
// 0→>0 edge: record when conflict started so watchdog can time the stuck state.
|
|
632
|
+
if (this.consecutive409s === 0)
|
|
633
|
+
this.conflict409StartedAt = Date.now();
|
|
433
634
|
this.consecutive409s++;
|
|
434
635
|
// Exponential backoff: 4s, 8s, 16s, 32s, max 60s
|
|
435
636
|
this.pollBackoffMs = Math.min(60_000, 2000 * Math.pow(2, this.consecutive409s));
|
|
@@ -754,7 +955,25 @@ export class TelegramLifeline {
|
|
|
754
955
|
*/
|
|
755
956
|
static FORWARD_ATTEMPTS = 3;
|
|
756
957
|
static FORWARD_BACKOFF_BASE_MS = 1000;
|
|
958
|
+
/**
|
|
959
|
+
* `legacyStrict` — if a pre-Stage-B server strictly validates JSON and
|
|
960
|
+
* rejects the unknown `lifelineVersion` field with 400, the lifeline
|
|
961
|
+
* falls back to omitting it and pins this flag for the session.
|
|
962
|
+
*/
|
|
963
|
+
legacyStrictServer = false;
|
|
964
|
+
/** Full semver of this lifeline, read once at construction. */
|
|
965
|
+
lifelineVersion = getInstarVersion();
|
|
757
966
|
async forwardToServer(topicId, text, rawMsg) {
|
|
967
|
+
const buildBody = (includeVersion) => JSON.stringify({
|
|
968
|
+
topicId,
|
|
969
|
+
text,
|
|
970
|
+
fromUserId: rawMsg.from.id,
|
|
971
|
+
fromUsername: rawMsg.from.username,
|
|
972
|
+
fromFirstName: rawMsg.from.first_name,
|
|
973
|
+
messageId: rawMsg.message_id,
|
|
974
|
+
timestamp: new Date(rawMsg.date * 1000).toISOString(),
|
|
975
|
+
...(includeVersion ? { lifelineVersion: this.lifelineVersion } : {}),
|
|
976
|
+
});
|
|
758
977
|
const doForward = async () => {
|
|
759
978
|
const controller = new AbortController();
|
|
760
979
|
const timer = setTimeout(() => controller.abort(), 10_000);
|
|
@@ -766,21 +985,39 @@ export class TelegramLifeline {
|
|
|
766
985
|
const response = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, {
|
|
767
986
|
method: 'POST',
|
|
768
987
|
headers: fwdHeaders,
|
|
769
|
-
body:
|
|
770
|
-
topicId,
|
|
771
|
-
text,
|
|
772
|
-
fromUserId: rawMsg.from.id,
|
|
773
|
-
fromUsername: rawMsg.from.username,
|
|
774
|
-
fromFirstName: rawMsg.from.first_name,
|
|
775
|
-
messageId: rawMsg.message_id,
|
|
776
|
-
timestamp: new Date(rawMsg.date * 1000).toISOString(),
|
|
777
|
-
}),
|
|
988
|
+
body: buildBody(!this.legacyStrictServer),
|
|
778
989
|
signal: controller.signal,
|
|
779
990
|
});
|
|
780
|
-
if (
|
|
781
|
-
|
|
991
|
+
if (response.ok)
|
|
992
|
+
return true;
|
|
993
|
+
if (response.status === 426) {
|
|
994
|
+
const body = (await response.json().catch(() => ({})));
|
|
995
|
+
throw new ForwardVersionSkewError(426, body);
|
|
782
996
|
}
|
|
783
|
-
|
|
997
|
+
if (response.status === 503) {
|
|
998
|
+
const body = (await response.json().catch(() => ({})));
|
|
999
|
+
throw new ForwardServerBootError(body.retryAfterMs ?? 1000);
|
|
1000
|
+
}
|
|
1001
|
+
if (response.status === 400) {
|
|
1002
|
+
const body = await response.json().catch(() => ({}));
|
|
1003
|
+
// Graceful degradation: if we included lifelineVersion and the
|
|
1004
|
+
// server rejected the request, retry once without it.
|
|
1005
|
+
if (!this.legacyStrictServer) {
|
|
1006
|
+
this.legacyStrictServer = true;
|
|
1007
|
+
console.warn(`[Lifeline] server returned 400 with lifelineVersion; ` +
|
|
1008
|
+
`retrying without (legacyStrictServer=true)`);
|
|
1009
|
+
// Re-issue the request WITHOUT the version field and return the
|
|
1010
|
+
// result of that retry. If still 400, it's a genuine bad request.
|
|
1011
|
+
const r2 = await fetch(`http://127.0.0.1:${this.projectConfig.port}/internal/telegram-forward`, { method: 'POST', headers: fwdHeaders, body: buildBody(false) });
|
|
1012
|
+
if (r2.ok)
|
|
1013
|
+
return true;
|
|
1014
|
+
if (r2.status === 400)
|
|
1015
|
+
throw new ForwardBadRequestError(await r2.json().catch(() => ({})));
|
|
1016
|
+
throw new ForwardTransientError(r2.status);
|
|
1017
|
+
}
|
|
1018
|
+
throw new ForwardBadRequestError(body);
|
|
1019
|
+
}
|
|
1020
|
+
throw new ForwardTransientError(response.status);
|
|
784
1021
|
}
|
|
785
1022
|
finally {
|
|
786
1023
|
clearTimeout(timer);
|
|
@@ -790,6 +1027,7 @@ export class TelegramLifeline {
|
|
|
790
1027
|
await retryWithBackoff(doForward, {
|
|
791
1028
|
attempts: TelegramLifeline.FORWARD_ATTEMPTS,
|
|
792
1029
|
baseMs: TelegramLifeline.FORWARD_BACKOFF_BASE_MS,
|
|
1030
|
+
isTerminal: isTerminalForwardError,
|
|
793
1031
|
onAttempt: (n, lastErr) => {
|
|
794
1032
|
if (n > 1) {
|
|
795
1033
|
console.warn(`[Lifeline] forwardToServer retry ${n}/${TelegramLifeline.FORWARD_ATTEMPTS} ` +
|
|
@@ -797,12 +1035,51 @@ export class TelegramLifeline {
|
|
|
797
1035
|
}
|
|
798
1036
|
},
|
|
799
1037
|
});
|
|
1038
|
+
// Record success for watchdog.
|
|
1039
|
+
this.consecutiveForwardFailures = 0;
|
|
1040
|
+
this.lastForwardSuccessAt = Date.now();
|
|
800
1041
|
return true;
|
|
801
1042
|
}
|
|
802
|
-
catch {
|
|
1043
|
+
catch (err) {
|
|
1044
|
+
// Version-skew handler: emit signal + request restart via orchestrator.
|
|
1045
|
+
if (err instanceof ForwardVersionSkewError) {
|
|
1046
|
+
this.handleVersionSkew(err);
|
|
1047
|
+
return false;
|
|
1048
|
+
}
|
|
1049
|
+
this.consecutiveForwardFailures++;
|
|
803
1050
|
return false;
|
|
804
1051
|
}
|
|
805
1052
|
}
|
|
1053
|
+
/**
|
|
1054
|
+
* Handle a 426 response from the server. Validates the response body's
|
|
1055
|
+
* `serverVersion` differs from this lifeline's, then requests restart
|
|
1056
|
+
* through the orchestrator. If the body is malformed or the versions
|
|
1057
|
+
* match (loopback impostor), treat as transient.
|
|
1058
|
+
*/
|
|
1059
|
+
handleVersionSkew(err) {
|
|
1060
|
+
const { body } = err;
|
|
1061
|
+
if (body.upgradeRequired !== true) {
|
|
1062
|
+
// Not a genuine Stage-B upgrade directive; treat as transient noise.
|
|
1063
|
+
this.consecutiveForwardFailures++;
|
|
1064
|
+
return;
|
|
1065
|
+
}
|
|
1066
|
+
if (typeof body.serverVersion !== 'string' || body.serverVersion === this.lifelineVersion) {
|
|
1067
|
+
// Loopback impostor or malformed body — don't trust it.
|
|
1068
|
+
console.warn(`[Lifeline] ignoring 426 with missing/matching serverVersion`);
|
|
1069
|
+
this.consecutiveForwardFailures++;
|
|
1070
|
+
return;
|
|
1071
|
+
}
|
|
1072
|
+
this.initiateRestart('versionSkew', 'version-skew', {
|
|
1073
|
+
serverVersion: body.serverVersion,
|
|
1074
|
+
lifelineVersion: this.lifelineVersion,
|
|
1075
|
+
});
|
|
1076
|
+
}
|
|
1077
|
+
/** Watchdog-tracked counters/state. */
/** Forward attempts that have failed in a row; reset to 0 when a forward succeeds. */
consecutiveForwardFailures = 0;
/** Epoch ms of the most recent successful forward; 0 until the first success. */
lastForwardSuccessAt = 0;
/** Epoch ms at which the current run of 409 Conflict poll errors began, or null when there is none. */
conflict409StartedAt = null;
/** RestartOrchestrator instance; null until installOrchestratorAndWatchdog() runs. */
orchestrator = null;
/** LifelineHealthWatchdog instance; null until installOrchestratorAndWatchdog() runs. */
watchdog = null;
|
|
806
1083
|
// ── Lifeline Commands ─────────────────────────────────────
|
|
807
1084
|
async handleLifelineCommand(text, topicId, fromUserId) {
|
|
808
1085
|
const cmd = text.trim().toLowerCase();
|