specmem-hardwicksoftware 3.5.99 → 3.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -48,6 +48,11 @@ const DEFAULT_CONFIG = {
48
48
  autoStart: process.env['SPECMEM_EMBEDDING_AUTO_START'] !== 'false',
49
49
  killStaleOnStart: process.env['SPECMEM_EMBEDDING_KILL_STALE'] !== 'false',
50
50
  maxProcessAgeHours: parseFloat(process.env['SPECMEM_EMBEDDING_MAX_AGE_HOURS'] || '1'),
51
+ // Circuit breaker configuration (Issue #10)
52
+ cbRestartWindowMs: parseInt(process.env['SPECMEM_RESTART_WINDOW_MS'] || '300000', 10),
53
+ cbMaxRestartsInWindow: parseInt(process.env['SPECMEM_RESTART_MAX_IN_WINDOW'] || '5', 10),
54
+ cbCooldownMs: parseInt(process.env['SPECMEM_RESTART_COOLDOWN_MS'] || '60000', 10),
55
+ cbMaxCooldownMs: parseInt(process.env['SPECMEM_RESTART_MAX_COOLDOWN_MS'] || '600000', 10),
51
56
  };
52
57
  // ============================================================================
53
58
  // EMBEDDING SERVER MANAGER
@@ -89,6 +94,13 @@ export class EmbeddingServerManager extends EventEmitter {
89
94
  stoppedFlagPath;
90
95
  // Phase 4: Track restart timestamps for loop detection
91
96
  restartTimestamps = [];
97
+ // Circuit breaker state (Issue #10)
98
+ // States: 'closed' (normal), 'open' (tripped, blocking restarts), 'half-open' (testing one restart)
99
+ cbState = 'closed';
100
+ cbRestartTimestamps = []; // sliding window of restart timestamps
101
+ cbCurrentCooldownMs = 0; // current cooldown duration (doubles on repeated failures)
102
+ cbCooldownUntil = 0; // timestamp when cooldown expires
103
+ cbLastStateChange = Date.now();
92
104
  // KYS (Keep Yourself Safe) heartbeat timer - sends heartbeat every 25s to embedding server
93
105
  // If embedding server doesn't receive heartbeat within 90s, it commits suicide
94
106
  // This prevents zombie embedding servers when MCP crashes (increased from 30s for startup tolerance)
@@ -329,6 +341,56 @@ export class EmbeddingServerManager extends EventEmitter {
329
341
  logger.warn({ error: err }, '[EmbeddingServerManager] Failed to remove old socket');
330
342
  }
331
343
  }
344
+ // ═══════════════════════════════════════════════════════════════════════════
345
+ // PRE-SPAWN ORPHAN KILL: Ensure NO other Frankenstein is running for this socket
346
+ // This is the LAST line of defense before spawning a new process
347
+ // ═══════════════════════════════════════════════════════════════════════════
348
+ try {
349
+ const killWaitMs = parseInt(process.env['SPECMEM_ORPHAN_KILL_WAIT_MS'] || '2000', 10);
350
+ // 1. Kill via PID file
351
+ const pidFilePath = join(dirname(this.socketPath), 'embedding.pid');
352
+ if (existsSync(pidFilePath)) {
353
+ const pidContent = readFileSync(pidFilePath, 'utf8').trim();
354
+ const oldPid = parseInt(pidContent.split(':')[0], 10);
355
+ if (oldPid && !isNaN(oldPid) && oldPid !== process.pid) {
356
+ try {
357
+ process.kill(oldPid, 0);
358
+ logger.info({ pid: oldPid }, '[EmbeddingServerManager] Killing existing process before spawn');
359
+ process.kill(oldPid, 'SIGTERM');
360
+ await this.sleep(killWaitMs);
361
+ try {
362
+ process.kill(oldPid, 0);
363
+ process.kill(oldPid, 'SIGKILL');
364
+ logger.warn({ pid: oldPid }, '[EmbeddingServerManager] Force killed stubborn process');
365
+ } catch { /* dead */ }
366
+ } catch { /* not running */ }
367
+ }
368
+ }
369
+ // 2. Kill via pgrep as fallback (catches processes without PID files)
370
+ try {
371
+ const { execSync: execSyncLocal } = await import('child_process');
372
+ const pids = execSyncLocal(`pgrep -f "frankenstein-embeddings.py" 2>/dev/null || true`, { encoding: 'utf8' }).trim().split('\n').filter(Boolean);
373
+ for (const pidStr of pids) {
374
+ const pid = parseInt(pidStr, 10);
375
+ if (pid && pid !== process.pid) {
376
+ try {
377
+ process.kill(pid, 'SIGTERM');
378
+ logger.info({ pid }, '[EmbeddingServerManager] Killed orphan frankenstein process (pgrep)');
379
+ } catch { /* already dead */ }
380
+ }
381
+ }
382
+ if (pids.filter(Boolean).length > 0) {
383
+ await this.sleep(killWaitMs);
384
+ }
385
+ } catch { /* pgrep not available or no matches */ }
386
+ // 3. Clean stale socket
387
+ if (existsSync(this.socketPath)) {
388
+ unlinkSync(this.socketPath);
389
+ logger.debug('[EmbeddingServerManager] Removed stale socket before spawn');
390
+ }
391
+ } catch (preSpawnErr) {
392
+ logger.debug({ error: preSpawnErr }, '[EmbeddingServerManager] Pre-spawn cleanup failed (non-fatal)');
393
+ }
332
394
  // Find the embedding script (prefers warm-start.sh Docker mode)
333
395
  const scriptInfo = this.findEmbeddingScript();
334
396
  if (!scriptInfo) {
@@ -1064,6 +1126,8 @@ export class EmbeddingServerManager extends EventEmitter {
1064
1126
  this.restartTimestamps = [];
1065
1127
  this.consecutiveFailures = 0;
1066
1128
  this.isShuttingDown = false;
1129
+ // Reset circuit breaker on user-initiated start (Issue #10)
1130
+ this.resetCircuitBreaker();
1067
1131
  // Kill existing and start fresh
1068
1132
  await this.stop();
1069
1133
  // Clear the shutdown flag that stop() sets
@@ -1109,6 +1173,7 @@ export class EmbeddingServerManager extends EventEmitter {
1109
1173
  ...this.getStatus(),
1110
1174
  stoppedByUser: this.isStoppedByUser(),
1111
1175
  restartLoop: this.getRestartLoopInfo(),
1176
+ circuitBreaker: this.getCircuitBreakerStatus(),
1112
1177
  startupGrace: graceActive ? {
1113
1178
  active: true,
1114
1179
  remainingMs: this.startupGraceUntil - Date.now(),
@@ -1117,6 +1182,59 @@ export class EmbeddingServerManager extends EventEmitter {
1117
1182
  };
1118
1183
  }
1119
1184
  // ==========================================================================
1185
+ // CIRCUIT BREAKER (Issue #10)
1186
+ // ==========================================================================
1187
+ /**
1188
+ * Manually reset the circuit breaker - callable from MCP tools
1189
+ * Resets the circuit breaker to closed state, clears all cooldowns and counters.
1190
+ * Use this when the underlying issue has been resolved (e.g., model file fixed, dependency installed).
1191
+ */
1192
+ resetCircuitBreaker() {
1193
+ const previousState = this.cbState;
1194
+ this.cbState = 'closed';
1195
+ this.cbRestartTimestamps = [];
1196
+ this.cbCurrentCooldownMs = 0;
1197
+ this.cbCooldownUntil = 0;
1198
+ this.cbLastStateChange = Date.now();
1199
+ // Also reset the legacy restart counters
1200
+ this.restartCount = 0;
1201
+ this.restartTimestamps = [];
1202
+ this.consecutiveFailures = 0;
1203
+ logger.info({
1204
+ previousState,
1205
+ newState: 'closed',
1206
+ }, '[EmbeddingServerManager] Circuit breaker manually reset: -> closed (all counters cleared)');
1207
+ this.emit('circuit_breaker', { state: 'closed', manualReset: true });
1208
+ return {
1209
+ success: true,
1210
+ previousState,
1211
+ newState: 'closed',
1212
+ message: `Circuit breaker reset from '${previousState}' to 'closed'. All cooldowns and counters cleared.`,
1213
+ };
1214
+ }
1215
+ /**
1216
+ * Get circuit breaker status for diagnostics
1217
+ */
1218
+ getCircuitBreakerStatus() {
1219
+ const now = Date.now();
1220
+ // Prune window for accurate count
1221
+ const restartsInWindow = this.cbRestartTimestamps.filter(
1222
+ ts => (now - ts) < this.config.cbRestartWindowMs
1223
+ ).length;
1224
+ return {
1225
+ state: this.cbState,
1226
+ restartsInWindow,
1227
+ maxRestartsInWindow: this.config.cbMaxRestartsInWindow,
1228
+ windowMs: this.config.cbRestartWindowMs,
1229
+ currentCooldownMs: this.cbCurrentCooldownMs,
1230
+ maxCooldownMs: this.config.cbMaxCooldownMs,
1231
+ cooldownUntil: this.cbCooldownUntil > 0 ? new Date(this.cbCooldownUntil).toISOString() : null,
1232
+ cooldownRemainingMs: this.cbCooldownUntil > now ? this.cbCooldownUntil - now : 0,
1233
+ lastStateChange: new Date(this.cbLastStateChange).toISOString(),
1234
+ timeSinceLastStateChangeMs: now - this.cbLastStateChange,
1235
+ };
1236
+ }
1237
+ // ==========================================================================
1120
1238
  // PRIVATE METHODS
1121
1239
  // ==========================================================================
1122
1240
  /**
@@ -1610,7 +1728,12 @@ export class EmbeddingServerManager extends EventEmitter {
1610
1728
  this.attemptRestart();
1611
1729
  }
1612
1730
  /**
1613
- * Attempt to restart the server
1731
+ * Attempt to restart the server (with circuit breaker - Issue #10)
1732
+ *
1733
+ * Circuit breaker pattern:
1734
+ * - CLOSED: Normal operation, restarts allowed. Track restarts in sliding window.
1735
+ * - OPEN: Too many restarts in window, block all restarts, wait for cooldown.
1736
+ * - HALF-OPEN: After cooldown, allow ONE test restart. Success -> CLOSED, failure -> OPEN (doubled cooldown).
1614
1737
  */
1615
1738
  async attemptRestart() {
1616
1739
  // Phase 4: Don't restart if user manually stopped
@@ -1618,7 +1741,60 @@ export class EmbeddingServerManager extends EventEmitter {
1618
1741
  logger.info('[EmbeddingServerManager] Skipping restart - stopped by user');
1619
1742
  return;
1620
1743
  }
1621
- // Phase 4: Check for restart loop (>3 restarts in 60 seconds)
1744
+ const now = Date.now();
1745
+ // --- Circuit Breaker Logic (Issue #10) ---
1746
+ // Prune the sliding window: remove timestamps older than cbRestartWindowMs
1747
+ this.cbRestartTimestamps = this.cbRestartTimestamps.filter(
1748
+ ts => (now - ts) < this.config.cbRestartWindowMs
1749
+ );
1750
+ if (this.cbState === 'open') {
1751
+ // Circuit is OPEN - check if cooldown has elapsed
1752
+ if (now < this.cbCooldownUntil) {
1753
+ const remainingMs = this.cbCooldownUntil - now;
1754
+ logger.warn({
1755
+ cbState: this.cbState,
1756
+ cooldownRemainingMs: remainingMs,
1757
+ currentCooldownMs: this.cbCurrentCooldownMs,
1758
+ }, '[EmbeddingServerManager] Circuit breaker OPEN - restart blocked, waiting for cooldown');
1759
+ return;
1760
+ }
1761
+ // Cooldown elapsed - transition to half-open
1762
+ this.cbState = 'half-open';
1763
+ this.cbLastStateChange = now;
1764
+ logger.info({
1765
+ previousState: 'open',
1766
+ newState: 'half-open',
1767
+ cooldownMs: this.cbCurrentCooldownMs,
1768
+ }, '[EmbeddingServerManager] Circuit breaker: open -> half-open (allowing one test restart)');
1769
+ this.emit('circuit_breaker', { state: 'half-open', cooldownMs: this.cbCurrentCooldownMs });
1770
+ }
1771
+ if (this.cbState === 'closed') {
1772
+ // Check if we should trip the breaker
1773
+ if (this.cbRestartTimestamps.length >= this.config.cbMaxRestartsInWindow) {
1774
+ // Trip the circuit breaker
1775
+ this.cbState = 'open';
1776
+ this.cbCurrentCooldownMs = this.cbCurrentCooldownMs || this.config.cbCooldownMs;
1777
+ this.cbCooldownUntil = now + this.cbCurrentCooldownMs;
1778
+ this.cbLastStateChange = now;
1779
+ logger.error({
1780
+ previousState: 'closed',
1781
+ newState: 'open',
1782
+ restartsInWindow: this.cbRestartTimestamps.length,
1783
+ windowMs: this.config.cbRestartWindowMs,
1784
+ maxAllowed: this.config.cbMaxRestartsInWindow,
1785
+ cooldownMs: this.cbCurrentCooldownMs,
1786
+ cooldownUntil: new Date(this.cbCooldownUntil).toISOString(),
1787
+ }, '[EmbeddingServerManager] Circuit breaker TRIPPED: closed -> open (too many restarts in window)');
1788
+ this.emit('circuit_breaker', {
1789
+ state: 'open',
1790
+ restartsInWindow: this.cbRestartTimestamps.length,
1791
+ cooldownMs: this.cbCurrentCooldownMs,
1792
+ });
1793
+ return;
1794
+ }
1795
+ }
1796
+ // --- End Circuit Breaker pre-check ---
1797
+ // Phase 4: Check for restart loop (>3 restarts in 60 seconds) - legacy check
1622
1798
  const loopInfo = this.getRestartLoopInfo();
1623
1799
  if (loopInfo.inLoop) {
1624
1800
  logger.error({
@@ -1632,7 +1808,7 @@ export class EmbeddingServerManager extends EventEmitter {
1632
1808
  await this.sleep(backoffMs);
1633
1809
  }
1634
1810
  // Check cooldown
1635
- const timeSinceLastRestart = Date.now() - this.lastRestartTime;
1811
+ const timeSinceLastRestart = now - this.lastRestartTime;
1636
1812
  if (timeSinceLastRestart < this.config.restartCooldownMs) {
1637
1813
  const waitTime = this.config.restartCooldownMs - timeSinceLastRestart;
1638
1814
  logger.debug({ waitTime }, '[EmbeddingServerManager] Waiting for restart cooldown');
@@ -1649,15 +1825,57 @@ export class EmbeddingServerManager extends EventEmitter {
1649
1825
  }
1650
1826
  this.restartCount++;
1651
1827
  this.lastRestartTime = Date.now();
1652
- // Phase 4: Track restart timestamp for loop detection
1653
- this.restartTimestamps.push(Date.now());
1654
- // Keep only last 10 timestamps
1828
+ // Track restart timestamp for both legacy loop detection and circuit breaker window
1829
+ const restartTs = Date.now();
1830
+ this.restartTimestamps.push(restartTs);
1831
+ this.cbRestartTimestamps.push(restartTs);
1832
+ // Keep only last 10 timestamps for legacy tracking
1655
1833
  if (this.restartTimestamps.length > 10) {
1656
1834
  this.restartTimestamps.shift();
1657
1835
  }
1658
- logger.info({ attempt: this.restartCount }, '[EmbeddingServerManager] Attempting restart');
1836
+ logger.info({
1837
+ attempt: this.restartCount,
1838
+ cbState: this.cbState,
1839
+ restartsInWindow: this.cbRestartTimestamps.length,
1840
+ }, '[EmbeddingServerManager] Attempting restart');
1659
1841
  this.emit('restarting', { attempt: this.restartCount });
1660
1842
  const success = await this.start();
1843
+ // --- Circuit Breaker post-restart evaluation ---
1844
+ if (this.cbState === 'half-open') {
1845
+ if (success) {
1846
+ // Test restart succeeded - close the circuit breaker
1847
+ this.cbState = 'closed';
1848
+ this.cbCurrentCooldownMs = 0; // Reset cooldown on success
1849
+ this.cbRestartTimestamps = [];
1850
+ this.cbLastStateChange = Date.now();
1851
+ logger.info({
1852
+ previousState: 'half-open',
1853
+ newState: 'closed',
1854
+ }, '[EmbeddingServerManager] Circuit breaker: half-open -> closed (restart succeeded, counters reset)');
1855
+ this.emit('circuit_breaker', { state: 'closed' });
1856
+ }
1857
+ else {
1858
+ // Test restart failed - reopen with doubled cooldown
1859
+ this.cbState = 'open';
1860
+ this.cbCurrentCooldownMs = Math.min(
1861
+ this.cbCurrentCooldownMs * 2,
1862
+ this.config.cbMaxCooldownMs
1863
+ );
1864
+ this.cbCooldownUntil = Date.now() + this.cbCurrentCooldownMs;
1865
+ this.cbLastStateChange = Date.now();
1866
+ logger.error({
1867
+ previousState: 'half-open',
1868
+ newState: 'open',
1869
+ newCooldownMs: this.cbCurrentCooldownMs,
1870
+ maxCooldownMs: this.config.cbMaxCooldownMs,
1871
+ cooldownUntil: new Date(this.cbCooldownUntil).toISOString(),
1872
+ }, '[EmbeddingServerManager] Circuit breaker: half-open -> open (restart failed, cooldown doubled)');
1873
+ this.emit('circuit_breaker', {
1874
+ state: 'open',
1875
+ cooldownMs: this.cbCurrentCooldownMs,
1876
+ });
1877
+ }
1878
+ }
1661
1879
  if (!success) {
1662
1880
  // Will retry on next health check
1663
1881
  logger.warn('[EmbeddingServerManager] Restart attempt failed');
@@ -31,7 +31,9 @@ export var ComponentHealth;
31
31
  ComponentHealth["UNKNOWN"] = "unknown";
32
32
  })(ComponentHealth || (ComponentHealth = {}));
33
33
  const DEFAULT_CONFIG = {
34
- checkIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_INTERVAL'] || '30000', 10),
34
+ checkIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_INTERVAL_MS'] || process.env['SPECMEM_HEALTH_CHECK_INTERVAL'] || '30000', 10),
35
+ // Adaptive interval for unhealthy state (Issue #16)
36
+ unhealthyCheckIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_UNHEALTHY_INTERVAL_MS'] || '5000', 10),
35
37
  dbTimeoutMs: parseInt(process.env['SPECMEM_HEALTH_DB_TIMEOUT'] || '5000', 10),
36
38
  // Use unified timeout config for embedding health checks
37
39
  embeddingTimeoutMs: getEmbeddingTimeout('health'),
@@ -58,6 +60,14 @@ export class HealthMonitor extends EventEmitter {
58
60
  checkTimer = null;
59
61
  logTimer = null;
60
62
  isRunning = false;
63
+ // Issue #16: Concurrency guard - prevents overlapping health checks
64
+ isCheckRunning = false;
65
+ // Issue #16: Diagnostics
66
+ totalHealthChecks = 0;
67
+ lastCheckTimestamp = 0;
68
+ // Issue #16: Adaptive interval tracking
69
+ currentCheckIntervalMs = 0;
70
+ consecutiveHealthyChecks = 0;
61
71
  // Component references
62
72
  resilientTransport = null;
63
73
  database = null;
@@ -76,8 +86,11 @@ export class HealthMonitor extends EventEmitter {
76
86
  this.transportHealth = this.createInitialHealth('transport');
77
87
  this.databaseHealth = this.createInitialHealth('database');
78
88
  this.embeddingHealth = this.createInitialHealth('embedding');
89
+ // Issue #16: Initialize adaptive interval to healthy rate
90
+ this.currentCheckIntervalMs = this.config.checkIntervalMs;
79
91
  logger.info({
80
92
  checkIntervalMs: this.config.checkIntervalMs,
93
+ unhealthyCheckIntervalMs: this.config.unhealthyCheckIntervalMs,
81
94
  autoRecoveryEnabled: this.config.autoRecoveryEnabled,
82
95
  logHealthStatus: this.config.logHealthStatus
83
96
  }, '[HealthMonitor] Initialized with config');
@@ -116,6 +129,10 @@ export class HealthMonitor extends EventEmitter {
116
129
  }
117
130
  /**
118
131
  * Start the health monitoring loop
132
+ *
133
+ * Issue #16: Uses setTimeout + recursive scheduling instead of setInterval
134
+ * to prevent check stacking when a health check takes longer than the interval.
135
+ * Uses adaptive intervals: faster checks when unhealthy, slower when healthy.
119
136
  */
120
137
  start() {
121
138
  if (this.isRunning) {
@@ -124,13 +141,9 @@ export class HealthMonitor extends EventEmitter {
124
141
  }
125
142
  this.isRunning = true;
126
143
  this.startTime = Date.now();
127
- // Start periodic health checks
128
- this.checkTimer = setInterval(() => {
129
- this.runHealthChecks().catch(err => {
130
- logger.error({ error: err }, '[HealthMonitor] Health check error');
131
- });
132
- }, this.config.checkIntervalMs);
133
- this.checkTimer.unref();
144
+ this.currentCheckIntervalMs = this.config.checkIntervalMs;
145
+ // Issue #16: Start recursive setTimeout-based health check loop
146
+ this.scheduleNextHealthCheck();
134
147
  // Start periodic health logging if enabled
135
148
  if (this.config.logHealthStatus) {
136
149
  this.logTimer = setInterval(() => {
@@ -138,11 +151,40 @@ export class HealthMonitor extends EventEmitter {
138
151
  }, this.config.logIntervalMs);
139
152
  this.logTimer.unref();
140
153
  }
141
- // Run initial health check
154
+ // Run initial health check immediately
142
155
  this.runHealthChecks().catch(err => {
143
156
  logger.error({ error: err }, '[HealthMonitor] Initial health check error');
144
157
  });
145
- logger.info('[HealthMonitor] Health monitoring started');
158
+ logger.info({
159
+ initialIntervalMs: this.currentCheckIntervalMs,
160
+ unhealthyIntervalMs: this.config.unhealthyCheckIntervalMs,
161
+ }, '[HealthMonitor] Health monitoring started (adaptive setTimeout scheduling)');
162
+ }
163
+ /**
164
+ * Issue #16: Schedule the next health check using setTimeout (prevents stacking)
165
+ * Each check schedules the next one after completing, so checks never overlap from the timer.
166
+ */
167
+ scheduleNextHealthCheck() {
168
+ if (!this.isRunning) {
169
+ return;
170
+ }
171
+ // Clear any existing timer to prevent duplicates
172
+ if (this.checkTimer) {
173
+ clearTimeout(this.checkTimer);
174
+ this.checkTimer = null;
175
+ }
176
+ this.checkTimer = setTimeout(async () => {
177
+ if (!this.isRunning) return;
178
+ try {
179
+ await this.runHealthChecks();
180
+ }
181
+ catch (err) {
182
+ logger.error({ error: err }, '[HealthMonitor] Health check error');
183
+ }
184
+ // Schedule the next check (recursive scheduling)
185
+ this.scheduleNextHealthCheck();
186
+ }, this.currentCheckIntervalMs);
187
+ this.checkTimer.unref();
146
188
  }
147
189
  /**
148
190
  * Stop the health monitoring loop
@@ -152,36 +194,122 @@ export class HealthMonitor extends EventEmitter {
152
194
  return;
153
195
  }
154
196
  this.isRunning = false;
197
+ // Issue #16: checkTimer is now a setTimeout, use clearTimeout
155
198
  if (this.checkTimer) {
156
- clearInterval(this.checkTimer);
199
+ clearTimeout(this.checkTimer);
157
200
  this.checkTimer = null;
158
201
  }
159
202
  if (this.logTimer) {
160
203
  clearInterval(this.logTimer);
161
204
  this.logTimer = null;
162
205
  }
163
- logger.info('[HealthMonitor] Health monitoring stopped');
206
+ logger.info({
207
+ totalHealthChecks: this.totalHealthChecks,
208
+ lastCheckTimestamp: this.lastCheckTimestamp > 0 ? new Date(this.lastCheckTimestamp).toISOString() : null,
209
+ uptimeMs: Date.now() - this.startTime,
210
+ }, '[HealthMonitor] Health monitoring stopped');
211
+ }
212
+ /**
213
+ * Issue #16: Full cleanup/destroy method for graceful shutdown
214
+ * Stops all timers, removes all listeners, and resets all state.
215
+ */
216
+ destroy() {
217
+ this.stop();
218
+ this.removeAllListeners();
219
+ this.resilientTransport = null;
220
+ this.database = null;
221
+ this.embeddingSocketPath = null;
222
+ this.isCheckRunning = false;
223
+ this.totalHealthChecks = 0;
224
+ this.lastCheckTimestamp = 0;
225
+ this.consecutiveHealthyChecks = 0;
226
+ logger.info('[HealthMonitor] Destroyed - all resources released');
164
227
  }
165
228
  /**
166
229
  * Run all health checks (non-blocking)
230
+ *
231
+ * Issue #16: Added concurrency guard to prevent overlapping checks,
232
+ * diagnostic counters, and adaptive interval adjustment.
167
233
  */
168
234
  async runHealthChecks() {
169
- const checkPromises = [
170
- this.checkTransportHealth(),
171
- this.checkDatabaseHealth(),
172
- this.checkEmbeddingHealth()
173
- ];
174
- // Run all checks in parallel - they shouldn't block each other
175
- await Promise.allSettled(checkPromises);
176
- // Calculate overall health
177
- const result = this.getSystemHealth();
178
- // Emit health event
179
- this.emit('health', result);
180
- // Check for auto-recovery needs
181
- if (this.config.autoRecoveryEnabled) {
182
- await this.attemptAutoRecovery();
183
- }
184
- return result;
235
+ // Issue #16: Concurrency guard - prevent overlapping health checks
236
+ if (this.isCheckRunning) {
237
+ logger.debug('[HealthMonitor] Health check already in progress, skipping to prevent stacking');
238
+ return this.getSystemHealth();
239
+ }
240
+ this.isCheckRunning = true;
241
+ try {
242
+ const checkPromises = [
243
+ this.checkTransportHealth(),
244
+ this.checkDatabaseHealth(),
245
+ this.checkEmbeddingHealth()
246
+ ];
247
+ // Run all checks in parallel - they shouldn't block each other
248
+ await Promise.allSettled(checkPromises);
249
+ // Issue #16: Update diagnostics
250
+ this.totalHealthChecks++;
251
+ this.lastCheckTimestamp = Date.now();
252
+ // Calculate overall health
253
+ const result = this.getSystemHealth();
254
+ // Issue #16: Adaptive interval adjustment
255
+ this.adjustCheckInterval(result.overallHealth);
256
+ // Emit health event
257
+ this.emit('health', result);
258
+ // Check for auto-recovery needs
259
+ if (this.config.autoRecoveryEnabled) {
260
+ await this.attemptAutoRecovery();
261
+ }
262
+ return result;
263
+ }
264
+ finally {
265
+ this.isCheckRunning = false;
266
+ }
267
+ }
268
+ /**
269
+ * Issue #16: Adjust the check interval based on current system health
270
+ *
271
+ * - Healthy: use normal interval (SPECMEM_HEALTH_CHECK_INTERVAL_MS, default 30s)
272
+ * - Unhealthy: use fast interval (SPECMEM_HEALTH_CHECK_UNHEALTHY_INTERVAL_MS, default 5s)
273
+ * - Recovering: gradually increase from unhealthy to healthy interval
274
+ * (each consecutive healthy check increases interval by 25% toward the healthy rate)
275
+ */
276
+ adjustCheckInterval(overallHealth) {
277
+ const healthyInterval = this.config.checkIntervalMs;
278
+ const unhealthyInterval = this.config.unhealthyCheckIntervalMs;
279
+ const previousInterval = this.currentCheckIntervalMs;
280
+ if (overallHealth === ComponentHealth.UNHEALTHY) {
281
+ // Unhealthy: switch to fast polling immediately
282
+ this.currentCheckIntervalMs = unhealthyInterval;
283
+ this.consecutiveHealthyChecks = 0;
284
+ }
285
+ else if (overallHealth === ComponentHealth.DEGRADED) {
286
+ // Degraded: use midpoint between unhealthy and healthy
287
+ this.currentCheckIntervalMs = Math.round((unhealthyInterval + healthyInterval) / 2);
288
+ this.consecutiveHealthyChecks = 0;
289
+ }
290
+ else if (overallHealth === ComponentHealth.HEALTHY) {
291
+ this.consecutiveHealthyChecks++;
292
+ if (this.currentCheckIntervalMs < healthyInterval) {
293
+ // Recovering: gradually increase interval back to healthy rate
294
+ // Each consecutive healthy check moves 25% closer to the healthy interval
295
+ const step = (healthyInterval - this.currentCheckIntervalMs) * 0.25;
296
+ this.currentCheckIntervalMs = Math.round(
297
+ Math.min(this.currentCheckIntervalMs + Math.max(step, 1000), healthyInterval)
298
+ );
299
+ }
300
+ else {
301
+ this.currentCheckIntervalMs = healthyInterval;
302
+ }
303
+ }
304
+ // Log interval changes
305
+ if (previousInterval !== this.currentCheckIntervalMs) {
306
+ logger.info({
307
+ previousIntervalMs: previousInterval,
308
+ newIntervalMs: this.currentCheckIntervalMs,
309
+ overallHealth,
310
+ consecutiveHealthyChecks: this.consecutiveHealthyChecks,
311
+ }, '[HealthMonitor] Adaptive interval adjusted');
312
+ }
185
313
  }
186
314
  /**
187
315
  * Check transport health
@@ -590,7 +718,12 @@ export class HealthMonitor extends EventEmitter {
590
718
  embedding: { ...this.embeddingHealth }
591
719
  },
592
720
  uptime: Date.now() - this.startTime,
593
- timestamp: new Date().toISOString()
721
+ timestamp: new Date().toISOString(),
722
+ // Issue #16: Diagnostics
723
+ totalHealthChecks: this.totalHealthChecks,
724
+ lastCheckTimestamp: this.lastCheckTimestamp > 0 ? new Date(this.lastCheckTimestamp).toISOString() : null,
725
+ currentCheckIntervalMs: this.currentCheckIntervalMs,
726
+ consecutiveHealthyChecks: this.consecutiveHealthyChecks,
594
727
  };
595
728
  }
596
729
  /**
@@ -667,7 +800,7 @@ export function resetHealthMonitor(projectPath) {
667
800
  const targetProject = projectPath || getProjectPath();
668
801
  const monitor = healthMonitorsByProject.get(targetProject);
669
802
  if (monitor) {
670
- monitor.stop();
803
+ monitor.destroy();
671
804
  healthMonitorsByProject.delete(targetProject);
672
805
  logger.debug({ projectPath: targetProject }, '[HealthMonitor] Reset instance for project');
673
806
  }
@@ -677,8 +810,8 @@ export function resetHealthMonitor(projectPath) {
677
810
  */
678
811
  export function resetAllHealthMonitors() {
679
812
  for (const [projectPath, monitor] of healthMonitorsByProject) {
680
- monitor.stop();
681
- logger.debug({ projectPath }, '[HealthMonitor] Stopped instance for project');
813
+ monitor.destroy();
814
+ logger.debug({ projectPath }, '[HealthMonitor] Destroyed instance for project');
682
815
  }
683
816
  healthMonitorsByProject.clear();
684
817
  }
@@ -12,6 +12,7 @@
12
12
  */
13
13
  import { getEmbeddingServerManager } from '../embeddingServerManager.js';
14
14
  import { logger } from '../../utils/logger.js';
15
+ import { execSync } from 'child_process';
15
16
  // Module-level reference to embedding provider for socket reset
16
17
  let embeddingProviderRef = null;
17
18
  /**
@@ -56,6 +57,36 @@ export class EmbeddingStart {
56
57
  status: manager.getExtendedStatus()
57
58
  };
58
59
  }
60
+ // ═══════════════════════════════════════════════════════════════
61
+ // HARD KILL: Force kill ALL frankenstein processes before restart
62
+ // This ensures force start actually works even when manager lost
63
+ // track of the process (e.g., after MCP reconnect)
64
+ // ═══════════════════════════════════════════════════════════════
65
+ try {
66
+ const pids = execSync('pgrep -f "frankenstein-embeddings.py" 2>/dev/null || true', { encoding: 'utf8' }).trim().split('\n').filter(Boolean);
67
+ if (pids.length > 0) {
68
+ logger.info({ pids, count: pids.length }, '[EmbeddingStart] Hard killing all frankenstein processes');
69
+ for (const pidStr of pids) {
70
+ const pid = parseInt(pidStr, 10);
71
+ if (pid && pid !== process.pid) {
72
+ try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ }
73
+ }
74
+ }
75
+ // Wait for them to die
76
+ const killWaitMs = parseInt(process.env['SPECMEM_ORPHAN_KILL_WAIT_MS'] || '2000', 10);
77
+ await new Promise(r => setTimeout(r, killWaitMs));
78
+ // Force kill survivors
79
+ for (const pidStr of pids) {
80
+ const pid = parseInt(pidStr, 10);
81
+ if (pid && pid !== process.pid) {
82
+ try { process.kill(pid, 'SIGKILL'); } catch { /* already dead */ }
83
+ }
84
+ }
85
+ logger.info('[EmbeddingStart] All old frankenstein processes killed');
86
+ }
87
+ } catch (killErr) {
88
+ logger.debug({ error: killErr }, '[EmbeddingStart] Hard kill failed (non-fatal)');
89
+ }
59
90
  const result = await manager.userStart();
60
91
  // CRITICAL: Reset the MCP's socket connection to pick up new server
61
92
  if (result.success && embeddingProviderRef?.resetSocket) {
@@ -29,6 +29,7 @@ import { logger } from '../../utils/logger.js';
29
29
  import { v4 as uuidv4 } from 'uuid';
30
30
  import { createHash } from 'crypto';
31
31
  import fs from 'fs';
32
+ import path from 'path';
32
33
  import { getProjectPathForInsert } from '../../services/ProjectContext.js';
33
34
  import { getFileCommsTransport } from '../../comms/fileCommsTransport.js';
34
35
  import { smartCompress } from '../../utils/tokenCompressor.js';
@@ -649,6 +650,13 @@ Examples:
649
650
  logger.warn({ error: fileErr }, '[TeamComms] File fallback failed - message only visible in current process');
650
651
  }
651
652
  }
653
+ // Write latest team message to statusbar state file for live display
654
+ try {
655
+ const statusFile = path.join(projectPath, 'specmem', 'sockets', 'team-comms-latest.json');
656
+ const commsState = { sender: senderDisplayName, message: message.slice(0, 80), timestamp, channel: channel || 'main' };
657
+ fs.writeFileSync(statusFile, JSON.stringify(commsState));
658
+ }
659
+ catch (_e) { /* non-fatal */ }
652
660
  logToTeamChannel('send_message', {
653
661
  messageId,
654
662
  type,
@@ -1130,6 +1138,14 @@ Use cross_project: true for system-wide announcements (use sparingly).`;
1130
1138
  };
1131
1139
  teamMessagesMemory.set(messageId, teamMessage);
1132
1140
  }
1141
+ // Write latest broadcast to statusbar state file for live display
1142
+ try {
1143
+ const broadcastProjectPath = cross_project ? (process.env.SPECMEM_PROJECT_PATH || process.cwd()) : projectPath;
1144
+ const statusFile = path.join(broadcastProjectPath, 'specmem', 'sockets', 'team-comms-latest.json');
1145
+ const commsState = { sender: senderName, message: message.slice(0, 80), timestamp, channel: 'broadcast' };
1146
+ fs.writeFileSync(statusFile, JSON.stringify(commsState));
1147
+ }
1148
+ catch (_e) { /* non-fatal */ }
1133
1149
  logToTeamChannel('broadcast', {
1134
1150
  messageId,
1135
1151
  broadcast_type,