specmem-hardwicksoftware 3.5.99 → 3.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/specmem-statusbar.cjs +154 -298
- package/claude-hooks/agent-loading-hook.js +8 -4
- package/claude-hooks/team-comms-enforcer.cjs +109 -92
- package/dist/config/embeddingTimeouts.js +4 -4
- package/dist/database.js +52 -6
- package/dist/db/bigBrainMigrations.js +7 -6
- package/dist/db/memoryDrilldown.sql +1 -1
- package/dist/db/projectSchemaInit.sql +21 -0
- package/dist/index.js +238 -13
- package/dist/installer/firstRun.js +2 -2
- package/dist/mcp/embeddingServerManager.js +225 -7
- package/dist/mcp/healthMonitor.js +165 -32
- package/dist/mcp/tools/embeddingControl.js +31 -0
- package/dist/mcp/tools/teamComms.js +16 -0
- package/dist/mcp/watcherIntegration.js +50 -7
- package/dist/services/CameraZoomSearch.js +62 -5
- package/dist/services/DimensionService.js +73 -6
- package/dist/services/EmbeddingQueue.js +64 -0
- package/dist/services/MemoryDrilldown.js +19 -12
- package/dist/tools/goofy/findCodePointers.js +11 -7
- package/dist/tools/goofy/findWhatISaid.js +145 -53
- package/dist/utils/qoms.js +187 -4
- package/dist/watcher/changeHandler.js +54 -4
- package/dist/watcher/fileWatcher.js +121 -1
- package/dist/watcher/index.js +75 -31
- package/dist/watcher/syncChecker.js +248 -63
- package/embedding-sandbox/__pycache__/frankenstein-embeddings.cpython-313.pyc +0 -0
- package/embedding-sandbox/frankenstein-embeddings.py +175 -64
- package/package.json +1 -1
|
@@ -48,6 +48,11 @@ const DEFAULT_CONFIG = {
|
|
|
48
48
|
autoStart: process.env['SPECMEM_EMBEDDING_AUTO_START'] !== 'false',
|
|
49
49
|
killStaleOnStart: process.env['SPECMEM_EMBEDDING_KILL_STALE'] !== 'false',
|
|
50
50
|
maxProcessAgeHours: parseFloat(process.env['SPECMEM_EMBEDDING_MAX_AGE_HOURS'] || '1'),
|
|
51
|
+
// Circuit breaker configuration (Issue #10)
|
|
52
|
+
cbRestartWindowMs: parseInt(process.env['SPECMEM_RESTART_WINDOW_MS'] || '300000', 10),
|
|
53
|
+
cbMaxRestartsInWindow: parseInt(process.env['SPECMEM_RESTART_MAX_IN_WINDOW'] || '5', 10),
|
|
54
|
+
cbCooldownMs: parseInt(process.env['SPECMEM_RESTART_COOLDOWN_MS'] || '60000', 10),
|
|
55
|
+
cbMaxCooldownMs: parseInt(process.env['SPECMEM_RESTART_MAX_COOLDOWN_MS'] || '600000', 10),
|
|
51
56
|
};
|
|
52
57
|
// ============================================================================
|
|
53
58
|
// EMBEDDING SERVER MANAGER
|
|
@@ -89,6 +94,13 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
89
94
|
stoppedFlagPath;
|
|
90
95
|
// Phase 4: Track restart timestamps for loop detection
|
|
91
96
|
restartTimestamps = [];
|
|
97
|
+
// Circuit breaker state (Issue #10)
|
|
98
|
+
// States: 'closed' (normal), 'open' (tripped, blocking restarts), 'half-open' (testing one restart)
|
|
99
|
+
cbState = 'closed';
|
|
100
|
+
cbRestartTimestamps = []; // sliding window of restart timestamps
|
|
101
|
+
cbCurrentCooldownMs = 0; // current cooldown duration (doubles on repeated failures)
|
|
102
|
+
cbCooldownUntil = 0; // timestamp when cooldown expires
|
|
103
|
+
cbLastStateChange = Date.now();
|
|
92
104
|
// KYS (Keep Yourself Safe) heartbeat timer - sends heartbeat every 25s to embedding server
|
|
93
105
|
// If embedding server doesn't receive heartbeat within 90s, it commits suicide
|
|
94
106
|
// This prevents zombie embedding servers when MCP crashes (increased from 30s for startup tolerance)
|
|
@@ -329,6 +341,56 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
329
341
|
logger.warn({ error: err }, '[EmbeddingServerManager] Failed to remove old socket');
|
|
330
342
|
}
|
|
331
343
|
}
|
|
344
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
345
|
+
// PRE-SPAWN ORPHAN KILL: Ensure NO other Frankenstein is running for this socket
|
|
346
|
+
// This is the LAST line of defense before spawning a new process
|
|
347
|
+
// ═══════════════════════════════════════════════════════════════════════════
|
|
348
|
+
try {
|
|
349
|
+
const killWaitMs = parseInt(process.env['SPECMEM_ORPHAN_KILL_WAIT_MS'] || '2000', 10);
|
|
350
|
+
// 1. Kill via PID file
|
|
351
|
+
const pidFilePath = join(dirname(this.socketPath), 'embedding.pid');
|
|
352
|
+
if (existsSync(pidFilePath)) {
|
|
353
|
+
const pidContent = readFileSync(pidFilePath, 'utf8').trim();
|
|
354
|
+
const oldPid = parseInt(pidContent.split(':')[0], 10);
|
|
355
|
+
if (oldPid && !isNaN(oldPid) && oldPid !== process.pid) {
|
|
356
|
+
try {
|
|
357
|
+
process.kill(oldPid, 0);
|
|
358
|
+
logger.info({ pid: oldPid }, '[EmbeddingServerManager] Killing existing process before spawn');
|
|
359
|
+
process.kill(oldPid, 'SIGTERM');
|
|
360
|
+
await this.sleep(killWaitMs);
|
|
361
|
+
try {
|
|
362
|
+
process.kill(oldPid, 0);
|
|
363
|
+
process.kill(oldPid, 'SIGKILL');
|
|
364
|
+
logger.warn({ pid: oldPid }, '[EmbeddingServerManager] Force killed stubborn process');
|
|
365
|
+
} catch { /* dead */ }
|
|
366
|
+
} catch { /* not running */ }
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
// 2. Kill via pgrep as fallback (catches processes without PID files)
|
|
370
|
+
try {
|
|
371
|
+
const { execSync: execSyncLocal } = await import('child_process');
|
|
372
|
+
const pids = execSyncLocal(`pgrep -f "frankenstein-embeddings.py" 2>/dev/null || true`, { encoding: 'utf8' }).trim().split('\n').filter(Boolean);
|
|
373
|
+
for (const pidStr of pids) {
|
|
374
|
+
const pid = parseInt(pidStr, 10);
|
|
375
|
+
if (pid && pid !== process.pid) {
|
|
376
|
+
try {
|
|
377
|
+
process.kill(pid, 'SIGTERM');
|
|
378
|
+
logger.info({ pid }, '[EmbeddingServerManager] Killed orphan frankenstein process (pgrep)');
|
|
379
|
+
} catch { /* already dead */ }
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
if (pids.filter(Boolean).length > 0) {
|
|
383
|
+
await this.sleep(killWaitMs);
|
|
384
|
+
}
|
|
385
|
+
} catch { /* pgrep not available or no matches */ }
|
|
386
|
+
// 3. Clean stale socket
|
|
387
|
+
if (existsSync(this.socketPath)) {
|
|
388
|
+
unlinkSync(this.socketPath);
|
|
389
|
+
logger.debug('[EmbeddingServerManager] Removed stale socket before spawn');
|
|
390
|
+
}
|
|
391
|
+
} catch (preSpawnErr) {
|
|
392
|
+
logger.debug({ error: preSpawnErr }, '[EmbeddingServerManager] Pre-spawn cleanup failed (non-fatal)');
|
|
393
|
+
}
|
|
332
394
|
// Find the embedding script (prefers warm-start.sh Docker mode)
|
|
333
395
|
const scriptInfo = this.findEmbeddingScript();
|
|
334
396
|
if (!scriptInfo) {
|
|
@@ -1064,6 +1126,8 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1064
1126
|
this.restartTimestamps = [];
|
|
1065
1127
|
this.consecutiveFailures = 0;
|
|
1066
1128
|
this.isShuttingDown = false;
|
|
1129
|
+
// Reset circuit breaker on user-initiated start (Issue #10)
|
|
1130
|
+
this.resetCircuitBreaker();
|
|
1067
1131
|
// Kill existing and start fresh
|
|
1068
1132
|
await this.stop();
|
|
1069
1133
|
// Clear the shutdown flag that stop() sets
|
|
@@ -1109,6 +1173,7 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1109
1173
|
...this.getStatus(),
|
|
1110
1174
|
stoppedByUser: this.isStoppedByUser(),
|
|
1111
1175
|
restartLoop: this.getRestartLoopInfo(),
|
|
1176
|
+
circuitBreaker: this.getCircuitBreakerStatus(),
|
|
1112
1177
|
startupGrace: graceActive ? {
|
|
1113
1178
|
active: true,
|
|
1114
1179
|
remainingMs: this.startupGraceUntil - Date.now(),
|
|
@@ -1117,6 +1182,59 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1117
1182
|
};
|
|
1118
1183
|
}
|
|
1119
1184
|
// ==========================================================================
|
|
1185
|
+
// CIRCUIT BREAKER (Issue #10)
|
|
1186
|
+
// ==========================================================================
|
|
1187
|
+
/**
|
|
1188
|
+
* Manually reset the circuit breaker - callable from MCP tools
|
|
1189
|
+
* Resets the circuit breaker to closed state, clears all cooldowns and counters.
|
|
1190
|
+
* Use this when the underlying issue has been resolved (e.g., model file fixed, dependency installed).
|
|
1191
|
+
*/
|
|
1192
|
+
resetCircuitBreaker() {
|
|
1193
|
+
const previousState = this.cbState;
|
|
1194
|
+
this.cbState = 'closed';
|
|
1195
|
+
this.cbRestartTimestamps = [];
|
|
1196
|
+
this.cbCurrentCooldownMs = 0;
|
|
1197
|
+
this.cbCooldownUntil = 0;
|
|
1198
|
+
this.cbLastStateChange = Date.now();
|
|
1199
|
+
// Also reset the legacy restart counters
|
|
1200
|
+
this.restartCount = 0;
|
|
1201
|
+
this.restartTimestamps = [];
|
|
1202
|
+
this.consecutiveFailures = 0;
|
|
1203
|
+
logger.info({
|
|
1204
|
+
previousState,
|
|
1205
|
+
newState: 'closed',
|
|
1206
|
+
}, '[EmbeddingServerManager] Circuit breaker manually reset: -> closed (all counters cleared)');
|
|
1207
|
+
this.emit('circuit_breaker', { state: 'closed', manualReset: true });
|
|
1208
|
+
return {
|
|
1209
|
+
success: true,
|
|
1210
|
+
previousState,
|
|
1211
|
+
newState: 'closed',
|
|
1212
|
+
message: `Circuit breaker reset from '${previousState}' to 'closed'. All cooldowns and counters cleared.`,
|
|
1213
|
+
};
|
|
1214
|
+
}
|
|
1215
|
+
/**
|
|
1216
|
+
* Get circuit breaker status for diagnostics
|
|
1217
|
+
*/
|
|
1218
|
+
getCircuitBreakerStatus() {
|
|
1219
|
+
const now = Date.now();
|
|
1220
|
+
// Prune window for accurate count
|
|
1221
|
+
const restartsInWindow = this.cbRestartTimestamps.filter(
|
|
1222
|
+
ts => (now - ts) < this.config.cbRestartWindowMs
|
|
1223
|
+
).length;
|
|
1224
|
+
return {
|
|
1225
|
+
state: this.cbState,
|
|
1226
|
+
restartsInWindow,
|
|
1227
|
+
maxRestartsInWindow: this.config.cbMaxRestartsInWindow,
|
|
1228
|
+
windowMs: this.config.cbRestartWindowMs,
|
|
1229
|
+
currentCooldownMs: this.cbCurrentCooldownMs,
|
|
1230
|
+
maxCooldownMs: this.config.cbMaxCooldownMs,
|
|
1231
|
+
cooldownUntil: this.cbCooldownUntil > 0 ? new Date(this.cbCooldownUntil).toISOString() : null,
|
|
1232
|
+
cooldownRemainingMs: this.cbCooldownUntil > now ? this.cbCooldownUntil - now : 0,
|
|
1233
|
+
lastStateChange: new Date(this.cbLastStateChange).toISOString(),
|
|
1234
|
+
timeSinceLastStateChangeMs: now - this.cbLastStateChange,
|
|
1235
|
+
};
|
|
1236
|
+
}
|
|
1237
|
+
// ==========================================================================
|
|
1120
1238
|
// PRIVATE METHODS
|
|
1121
1239
|
// ==========================================================================
|
|
1122
1240
|
/**
|
|
@@ -1610,7 +1728,12 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1610
1728
|
this.attemptRestart();
|
|
1611
1729
|
}
|
|
1612
1730
|
/**
|
|
1613
|
-
* Attempt to restart the server
|
|
1731
|
+
* Attempt to restart the server (with circuit breaker - Issue #10)
|
|
1732
|
+
*
|
|
1733
|
+
* Circuit breaker pattern:
|
|
1734
|
+
* - CLOSED: Normal operation, restarts allowed. Track restarts in sliding window.
|
|
1735
|
+
* - OPEN: Too many restarts in window, block all restarts, wait for cooldown.
|
|
1736
|
+
* - HALF-OPEN: After cooldown, allow ONE test restart. Success -> CLOSED, failure -> OPEN (doubled cooldown).
|
|
1614
1737
|
*/
|
|
1615
1738
|
async attemptRestart() {
|
|
1616
1739
|
// Phase 4: Don't restart if user manually stopped
|
|
@@ -1618,7 +1741,60 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1618
1741
|
logger.info('[EmbeddingServerManager] Skipping restart - stopped by user');
|
|
1619
1742
|
return;
|
|
1620
1743
|
}
|
|
1621
|
-
|
|
1744
|
+
const now = Date.now();
|
|
1745
|
+
// --- Circuit Breaker Logic (Issue #10) ---
|
|
1746
|
+
// Prune the sliding window: remove timestamps older than cbRestartWindowMs
|
|
1747
|
+
this.cbRestartTimestamps = this.cbRestartTimestamps.filter(
|
|
1748
|
+
ts => (now - ts) < this.config.cbRestartWindowMs
|
|
1749
|
+
);
|
|
1750
|
+
if (this.cbState === 'open') {
|
|
1751
|
+
// Circuit is OPEN - check if cooldown has elapsed
|
|
1752
|
+
if (now < this.cbCooldownUntil) {
|
|
1753
|
+
const remainingMs = this.cbCooldownUntil - now;
|
|
1754
|
+
logger.warn({
|
|
1755
|
+
cbState: this.cbState,
|
|
1756
|
+
cooldownRemainingMs: remainingMs,
|
|
1757
|
+
currentCooldownMs: this.cbCurrentCooldownMs,
|
|
1758
|
+
}, '[EmbeddingServerManager] Circuit breaker OPEN - restart blocked, waiting for cooldown');
|
|
1759
|
+
return;
|
|
1760
|
+
}
|
|
1761
|
+
// Cooldown elapsed - transition to half-open
|
|
1762
|
+
this.cbState = 'half-open';
|
|
1763
|
+
this.cbLastStateChange = now;
|
|
1764
|
+
logger.info({
|
|
1765
|
+
previousState: 'open',
|
|
1766
|
+
newState: 'half-open',
|
|
1767
|
+
cooldownMs: this.cbCurrentCooldownMs,
|
|
1768
|
+
}, '[EmbeddingServerManager] Circuit breaker: open -> half-open (allowing one test restart)');
|
|
1769
|
+
this.emit('circuit_breaker', { state: 'half-open', cooldownMs: this.cbCurrentCooldownMs });
|
|
1770
|
+
}
|
|
1771
|
+
if (this.cbState === 'closed') {
|
|
1772
|
+
// Check if we should trip the breaker
|
|
1773
|
+
if (this.cbRestartTimestamps.length >= this.config.cbMaxRestartsInWindow) {
|
|
1774
|
+
// Trip the circuit breaker
|
|
1775
|
+
this.cbState = 'open';
|
|
1776
|
+
this.cbCurrentCooldownMs = this.cbCurrentCooldownMs || this.config.cbCooldownMs;
|
|
1777
|
+
this.cbCooldownUntil = now + this.cbCurrentCooldownMs;
|
|
1778
|
+
this.cbLastStateChange = now;
|
|
1779
|
+
logger.error({
|
|
1780
|
+
previousState: 'closed',
|
|
1781
|
+
newState: 'open',
|
|
1782
|
+
restartsInWindow: this.cbRestartTimestamps.length,
|
|
1783
|
+
windowMs: this.config.cbRestartWindowMs,
|
|
1784
|
+
maxAllowed: this.config.cbMaxRestartsInWindow,
|
|
1785
|
+
cooldownMs: this.cbCurrentCooldownMs,
|
|
1786
|
+
cooldownUntil: new Date(this.cbCooldownUntil).toISOString(),
|
|
1787
|
+
}, '[EmbeddingServerManager] Circuit breaker TRIPPED: closed -> open (too many restarts in window)');
|
|
1788
|
+
this.emit('circuit_breaker', {
|
|
1789
|
+
state: 'open',
|
|
1790
|
+
restartsInWindow: this.cbRestartTimestamps.length,
|
|
1791
|
+
cooldownMs: this.cbCurrentCooldownMs,
|
|
1792
|
+
});
|
|
1793
|
+
return;
|
|
1794
|
+
}
|
|
1795
|
+
}
|
|
1796
|
+
// --- End Circuit Breaker pre-check ---
|
|
1797
|
+
// Phase 4: Check for restart loop (>3 restarts in 60 seconds) - legacy check
|
|
1622
1798
|
const loopInfo = this.getRestartLoopInfo();
|
|
1623
1799
|
if (loopInfo.inLoop) {
|
|
1624
1800
|
logger.error({
|
|
@@ -1632,7 +1808,7 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1632
1808
|
await this.sleep(backoffMs);
|
|
1633
1809
|
}
|
|
1634
1810
|
// Check cooldown
|
|
1635
|
-
const timeSinceLastRestart =
|
|
1811
|
+
const timeSinceLastRestart = now - this.lastRestartTime;
|
|
1636
1812
|
if (timeSinceLastRestart < this.config.restartCooldownMs) {
|
|
1637
1813
|
const waitTime = this.config.restartCooldownMs - timeSinceLastRestart;
|
|
1638
1814
|
logger.debug({ waitTime }, '[EmbeddingServerManager] Waiting for restart cooldown');
|
|
@@ -1649,15 +1825,57 @@ export class EmbeddingServerManager extends EventEmitter {
|
|
|
1649
1825
|
}
|
|
1650
1826
|
this.restartCount++;
|
|
1651
1827
|
this.lastRestartTime = Date.now();
|
|
1652
|
-
//
|
|
1653
|
-
|
|
1654
|
-
|
|
1828
|
+
// Track restart timestamp for both legacy loop detection and circuit breaker window
|
|
1829
|
+
const restartTs = Date.now();
|
|
1830
|
+
this.restartTimestamps.push(restartTs);
|
|
1831
|
+
this.cbRestartTimestamps.push(restartTs);
|
|
1832
|
+
// Keep only last 10 timestamps for legacy tracking
|
|
1655
1833
|
if (this.restartTimestamps.length > 10) {
|
|
1656
1834
|
this.restartTimestamps.shift();
|
|
1657
1835
|
}
|
|
1658
|
-
logger.info({
|
|
1836
|
+
logger.info({
|
|
1837
|
+
attempt: this.restartCount,
|
|
1838
|
+
cbState: this.cbState,
|
|
1839
|
+
restartsInWindow: this.cbRestartTimestamps.length,
|
|
1840
|
+
}, '[EmbeddingServerManager] Attempting restart');
|
|
1659
1841
|
this.emit('restarting', { attempt: this.restartCount });
|
|
1660
1842
|
const success = await this.start();
|
|
1843
|
+
// --- Circuit Breaker post-restart evaluation ---
|
|
1844
|
+
if (this.cbState === 'half-open') {
|
|
1845
|
+
if (success) {
|
|
1846
|
+
// Test restart succeeded - close the circuit breaker
|
|
1847
|
+
this.cbState = 'closed';
|
|
1848
|
+
this.cbCurrentCooldownMs = 0; // Reset cooldown on success
|
|
1849
|
+
this.cbRestartTimestamps = [];
|
|
1850
|
+
this.cbLastStateChange = Date.now();
|
|
1851
|
+
logger.info({
|
|
1852
|
+
previousState: 'half-open',
|
|
1853
|
+
newState: 'closed',
|
|
1854
|
+
}, '[EmbeddingServerManager] Circuit breaker: half-open -> closed (restart succeeded, counters reset)');
|
|
1855
|
+
this.emit('circuit_breaker', { state: 'closed' });
|
|
1856
|
+
}
|
|
1857
|
+
else {
|
|
1858
|
+
// Test restart failed - reopen with doubled cooldown
|
|
1859
|
+
this.cbState = 'open';
|
|
1860
|
+
this.cbCurrentCooldownMs = Math.min(
|
|
1861
|
+
this.cbCurrentCooldownMs * 2,
|
|
1862
|
+
this.config.cbMaxCooldownMs
|
|
1863
|
+
);
|
|
1864
|
+
this.cbCooldownUntil = Date.now() + this.cbCurrentCooldownMs;
|
|
1865
|
+
this.cbLastStateChange = Date.now();
|
|
1866
|
+
logger.error({
|
|
1867
|
+
previousState: 'half-open',
|
|
1868
|
+
newState: 'open',
|
|
1869
|
+
newCooldownMs: this.cbCurrentCooldownMs,
|
|
1870
|
+
maxCooldownMs: this.config.cbMaxCooldownMs,
|
|
1871
|
+
cooldownUntil: new Date(this.cbCooldownUntil).toISOString(),
|
|
1872
|
+
}, '[EmbeddingServerManager] Circuit breaker: half-open -> open (restart failed, cooldown doubled)');
|
|
1873
|
+
this.emit('circuit_breaker', {
|
|
1874
|
+
state: 'open',
|
|
1875
|
+
cooldownMs: this.cbCurrentCooldownMs,
|
|
1876
|
+
});
|
|
1877
|
+
}
|
|
1878
|
+
}
|
|
1661
1879
|
if (!success) {
|
|
1662
1880
|
// Will retry on next health check
|
|
1663
1881
|
logger.warn('[EmbeddingServerManager] Restart attempt failed');
|
|
@@ -31,7 +31,9 @@ export var ComponentHealth;
|
|
|
31
31
|
ComponentHealth["UNKNOWN"] = "unknown";
|
|
32
32
|
})(ComponentHealth || (ComponentHealth = {}));
|
|
33
33
|
const DEFAULT_CONFIG = {
|
|
34
|
-
checkIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_INTERVAL'] || '30000', 10),
|
|
34
|
+
checkIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_INTERVAL_MS'] || process.env['SPECMEM_HEALTH_CHECK_INTERVAL'] || '30000', 10),
|
|
35
|
+
// Adaptive interval for unhealthy state (Issue #16)
|
|
36
|
+
unhealthyCheckIntervalMs: parseInt(process.env['SPECMEM_HEALTH_CHECK_UNHEALTHY_INTERVAL_MS'] || '5000', 10),
|
|
35
37
|
dbTimeoutMs: parseInt(process.env['SPECMEM_HEALTH_DB_TIMEOUT'] || '5000', 10),
|
|
36
38
|
// Use unified timeout config for embedding health checks
|
|
37
39
|
embeddingTimeoutMs: getEmbeddingTimeout('health'),
|
|
@@ -58,6 +60,14 @@ export class HealthMonitor extends EventEmitter {
|
|
|
58
60
|
checkTimer = null;
|
|
59
61
|
logTimer = null;
|
|
60
62
|
isRunning = false;
|
|
63
|
+
// Issue #16: Concurrency guard - prevents overlapping health checks
|
|
64
|
+
isCheckRunning = false;
|
|
65
|
+
// Issue #16: Diagnostics
|
|
66
|
+
totalHealthChecks = 0;
|
|
67
|
+
lastCheckTimestamp = 0;
|
|
68
|
+
// Issue #16: Adaptive interval tracking
|
|
69
|
+
currentCheckIntervalMs = 0;
|
|
70
|
+
consecutiveHealthyChecks = 0;
|
|
61
71
|
// Component references
|
|
62
72
|
resilientTransport = null;
|
|
63
73
|
database = null;
|
|
@@ -76,8 +86,11 @@ export class HealthMonitor extends EventEmitter {
|
|
|
76
86
|
this.transportHealth = this.createInitialHealth('transport');
|
|
77
87
|
this.databaseHealth = this.createInitialHealth('database');
|
|
78
88
|
this.embeddingHealth = this.createInitialHealth('embedding');
|
|
89
|
+
// Issue #16: Initialize adaptive interval to healthy rate
|
|
90
|
+
this.currentCheckIntervalMs = this.config.checkIntervalMs;
|
|
79
91
|
logger.info({
|
|
80
92
|
checkIntervalMs: this.config.checkIntervalMs,
|
|
93
|
+
unhealthyCheckIntervalMs: this.config.unhealthyCheckIntervalMs,
|
|
81
94
|
autoRecoveryEnabled: this.config.autoRecoveryEnabled,
|
|
82
95
|
logHealthStatus: this.config.logHealthStatus
|
|
83
96
|
}, '[HealthMonitor] Initialized with config');
|
|
@@ -116,6 +129,10 @@ export class HealthMonitor extends EventEmitter {
|
|
|
116
129
|
}
|
|
117
130
|
/**
|
|
118
131
|
* Start the health monitoring loop
|
|
132
|
+
*
|
|
133
|
+
* Issue #16: Uses setTimeout + recursive scheduling instead of setInterval
|
|
134
|
+
* to prevent check stacking when a health check takes longer than the interval.
|
|
135
|
+
* Uses adaptive intervals: faster checks when unhealthy, slower when healthy.
|
|
119
136
|
*/
|
|
120
137
|
start() {
|
|
121
138
|
if (this.isRunning) {
|
|
@@ -124,13 +141,9 @@ export class HealthMonitor extends EventEmitter {
|
|
|
124
141
|
}
|
|
125
142
|
this.isRunning = true;
|
|
126
143
|
this.startTime = Date.now();
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
logger.error({ error: err }, '[HealthMonitor] Health check error');
|
|
131
|
-
});
|
|
132
|
-
}, this.config.checkIntervalMs);
|
|
133
|
-
this.checkTimer.unref();
|
|
144
|
+
this.currentCheckIntervalMs = this.config.checkIntervalMs;
|
|
145
|
+
// Issue #16: Start recursive setTimeout-based health check loop
|
|
146
|
+
this.scheduleNextHealthCheck();
|
|
134
147
|
// Start periodic health logging if enabled
|
|
135
148
|
if (this.config.logHealthStatus) {
|
|
136
149
|
this.logTimer = setInterval(() => {
|
|
@@ -138,11 +151,40 @@ export class HealthMonitor extends EventEmitter {
|
|
|
138
151
|
}, this.config.logIntervalMs);
|
|
139
152
|
this.logTimer.unref();
|
|
140
153
|
}
|
|
141
|
-
// Run initial health check
|
|
154
|
+
// Run initial health check immediately
|
|
142
155
|
this.runHealthChecks().catch(err => {
|
|
143
156
|
logger.error({ error: err }, '[HealthMonitor] Initial health check error');
|
|
144
157
|
});
|
|
145
|
-
logger.info(
|
|
158
|
+
logger.info({
|
|
159
|
+
initialIntervalMs: this.currentCheckIntervalMs,
|
|
160
|
+
unhealthyIntervalMs: this.config.unhealthyCheckIntervalMs,
|
|
161
|
+
}, '[HealthMonitor] Health monitoring started (adaptive setTimeout scheduling)');
|
|
162
|
+
}
|
|
163
|
+
/**
|
|
164
|
+
* Issue #16: Schedule the next health check using setTimeout (prevents stacking)
|
|
165
|
+
* Each check schedules the next one after completing, so checks never overlap from the timer.
|
|
166
|
+
*/
|
|
167
|
+
scheduleNextHealthCheck() {
|
|
168
|
+
if (!this.isRunning) {
|
|
169
|
+
return;
|
|
170
|
+
}
|
|
171
|
+
// Clear any existing timer to prevent duplicates
|
|
172
|
+
if (this.checkTimer) {
|
|
173
|
+
clearTimeout(this.checkTimer);
|
|
174
|
+
this.checkTimer = null;
|
|
175
|
+
}
|
|
176
|
+
this.checkTimer = setTimeout(async () => {
|
|
177
|
+
if (!this.isRunning) return;
|
|
178
|
+
try {
|
|
179
|
+
await this.runHealthChecks();
|
|
180
|
+
}
|
|
181
|
+
catch (err) {
|
|
182
|
+
logger.error({ error: err }, '[HealthMonitor] Health check error');
|
|
183
|
+
}
|
|
184
|
+
// Schedule the next check (recursive scheduling)
|
|
185
|
+
this.scheduleNextHealthCheck();
|
|
186
|
+
}, this.currentCheckIntervalMs);
|
|
187
|
+
this.checkTimer.unref();
|
|
146
188
|
}
|
|
147
189
|
/**
|
|
148
190
|
* Stop the health monitoring loop
|
|
@@ -152,36 +194,122 @@ export class HealthMonitor extends EventEmitter {
|
|
|
152
194
|
return;
|
|
153
195
|
}
|
|
154
196
|
this.isRunning = false;
|
|
197
|
+
// Issue #16: checkTimer is now a setTimeout, use clearTimeout
|
|
155
198
|
if (this.checkTimer) {
|
|
156
|
-
|
|
199
|
+
clearTimeout(this.checkTimer);
|
|
157
200
|
this.checkTimer = null;
|
|
158
201
|
}
|
|
159
202
|
if (this.logTimer) {
|
|
160
203
|
clearInterval(this.logTimer);
|
|
161
204
|
this.logTimer = null;
|
|
162
205
|
}
|
|
163
|
-
logger.info(
|
|
206
|
+
logger.info({
|
|
207
|
+
totalHealthChecks: this.totalHealthChecks,
|
|
208
|
+
lastCheckTimestamp: this.lastCheckTimestamp > 0 ? new Date(this.lastCheckTimestamp).toISOString() : null,
|
|
209
|
+
uptimeMs: Date.now() - this.startTime,
|
|
210
|
+
}, '[HealthMonitor] Health monitoring stopped');
|
|
211
|
+
}
|
|
212
|
+
/**
|
|
213
|
+
* Issue #16: Full cleanup/destroy method for graceful shutdown
|
|
214
|
+
* Stops all timers, removes all listeners, and resets all state.
|
|
215
|
+
*/
|
|
216
|
+
destroy() {
|
|
217
|
+
this.stop();
|
|
218
|
+
this.removeAllListeners();
|
|
219
|
+
this.resilientTransport = null;
|
|
220
|
+
this.database = null;
|
|
221
|
+
this.embeddingSocketPath = null;
|
|
222
|
+
this.isCheckRunning = false;
|
|
223
|
+
this.totalHealthChecks = 0;
|
|
224
|
+
this.lastCheckTimestamp = 0;
|
|
225
|
+
this.consecutiveHealthyChecks = 0;
|
|
226
|
+
logger.info('[HealthMonitor] Destroyed - all resources released');
|
|
164
227
|
}
|
|
165
228
|
/**
|
|
166
229
|
* Run all health checks (non-blocking)
|
|
230
|
+
*
|
|
231
|
+
* Issue #16: Added concurrency guard to prevent overlapping checks,
|
|
232
|
+
* diagnostic counters, and adaptive interval adjustment.
|
|
167
233
|
*/
|
|
168
234
|
async runHealthChecks() {
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
this.
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
await
|
|
183
|
-
|
|
184
|
-
|
|
235
|
+
// Issue #16: Concurrency guard - prevent overlapping health checks
|
|
236
|
+
if (this.isCheckRunning) {
|
|
237
|
+
logger.debug('[HealthMonitor] Health check already in progress, skipping to prevent stacking');
|
|
238
|
+
return this.getSystemHealth();
|
|
239
|
+
}
|
|
240
|
+
this.isCheckRunning = true;
|
|
241
|
+
try {
|
|
242
|
+
const checkPromises = [
|
|
243
|
+
this.checkTransportHealth(),
|
|
244
|
+
this.checkDatabaseHealth(),
|
|
245
|
+
this.checkEmbeddingHealth()
|
|
246
|
+
];
|
|
247
|
+
// Run all checks in parallel - they shouldn't block each other
|
|
248
|
+
await Promise.allSettled(checkPromises);
|
|
249
|
+
// Issue #16: Update diagnostics
|
|
250
|
+
this.totalHealthChecks++;
|
|
251
|
+
this.lastCheckTimestamp = Date.now();
|
|
252
|
+
// Calculate overall health
|
|
253
|
+
const result = this.getSystemHealth();
|
|
254
|
+
// Issue #16: Adaptive interval adjustment
|
|
255
|
+
this.adjustCheckInterval(result.overallHealth);
|
|
256
|
+
// Emit health event
|
|
257
|
+
this.emit('health', result);
|
|
258
|
+
// Check for auto-recovery needs
|
|
259
|
+
if (this.config.autoRecoveryEnabled) {
|
|
260
|
+
await this.attemptAutoRecovery();
|
|
261
|
+
}
|
|
262
|
+
return result;
|
|
263
|
+
}
|
|
264
|
+
finally {
|
|
265
|
+
this.isCheckRunning = false;
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
/**
|
|
269
|
+
* Issue #16: Adjust the check interval based on current system health
|
|
270
|
+
*
|
|
271
|
+
* - Healthy: use normal interval (SPECMEM_HEALTH_CHECK_INTERVAL_MS, default 30s)
|
|
272
|
+
* - Unhealthy: use fast interval (SPECMEM_HEALTH_CHECK_UNHEALTHY_INTERVAL_MS, default 5s)
|
|
273
|
+
* - Recovering: gradually increase from unhealthy to healthy interval
|
|
274
|
+
* (each consecutive healthy check increases interval by 25% toward the healthy rate)
|
|
275
|
+
*/
|
|
276
|
+
adjustCheckInterval(overallHealth) {
|
|
277
|
+
const healthyInterval = this.config.checkIntervalMs;
|
|
278
|
+
const unhealthyInterval = this.config.unhealthyCheckIntervalMs;
|
|
279
|
+
const previousInterval = this.currentCheckIntervalMs;
|
|
280
|
+
if (overallHealth === ComponentHealth.UNHEALTHY) {
|
|
281
|
+
// Unhealthy: switch to fast polling immediately
|
|
282
|
+
this.currentCheckIntervalMs = unhealthyInterval;
|
|
283
|
+
this.consecutiveHealthyChecks = 0;
|
|
284
|
+
}
|
|
285
|
+
else if (overallHealth === ComponentHealth.DEGRADED) {
|
|
286
|
+
// Degraded: use midpoint between unhealthy and healthy
|
|
287
|
+
this.currentCheckIntervalMs = Math.round((unhealthyInterval + healthyInterval) / 2);
|
|
288
|
+
this.consecutiveHealthyChecks = 0;
|
|
289
|
+
}
|
|
290
|
+
else if (overallHealth === ComponentHealth.HEALTHY) {
|
|
291
|
+
this.consecutiveHealthyChecks++;
|
|
292
|
+
if (this.currentCheckIntervalMs < healthyInterval) {
|
|
293
|
+
// Recovering: gradually increase interval back to healthy rate
|
|
294
|
+
// Each consecutive healthy check moves 25% closer to the healthy interval
|
|
295
|
+
const step = (healthyInterval - this.currentCheckIntervalMs) * 0.25;
|
|
296
|
+
this.currentCheckIntervalMs = Math.round(
|
|
297
|
+
Math.min(this.currentCheckIntervalMs + Math.max(step, 1000), healthyInterval)
|
|
298
|
+
);
|
|
299
|
+
}
|
|
300
|
+
else {
|
|
301
|
+
this.currentCheckIntervalMs = healthyInterval;
|
|
302
|
+
}
|
|
303
|
+
}
|
|
304
|
+
// Log interval changes
|
|
305
|
+
if (previousInterval !== this.currentCheckIntervalMs) {
|
|
306
|
+
logger.info({
|
|
307
|
+
previousIntervalMs: previousInterval,
|
|
308
|
+
newIntervalMs: this.currentCheckIntervalMs,
|
|
309
|
+
overallHealth,
|
|
310
|
+
consecutiveHealthyChecks: this.consecutiveHealthyChecks,
|
|
311
|
+
}, '[HealthMonitor] Adaptive interval adjusted');
|
|
312
|
+
}
|
|
185
313
|
}
|
|
186
314
|
/**
|
|
187
315
|
* Check transport health
|
|
@@ -590,7 +718,12 @@ export class HealthMonitor extends EventEmitter {
|
|
|
590
718
|
embedding: { ...this.embeddingHealth }
|
|
591
719
|
},
|
|
592
720
|
uptime: Date.now() - this.startTime,
|
|
593
|
-
timestamp: new Date().toISOString()
|
|
721
|
+
timestamp: new Date().toISOString(),
|
|
722
|
+
// Issue #16: Diagnostics
|
|
723
|
+
totalHealthChecks: this.totalHealthChecks,
|
|
724
|
+
lastCheckTimestamp: this.lastCheckTimestamp > 0 ? new Date(this.lastCheckTimestamp).toISOString() : null,
|
|
725
|
+
currentCheckIntervalMs: this.currentCheckIntervalMs,
|
|
726
|
+
consecutiveHealthyChecks: this.consecutiveHealthyChecks,
|
|
594
727
|
};
|
|
595
728
|
}
|
|
596
729
|
/**
|
|
@@ -667,7 +800,7 @@ export function resetHealthMonitor(projectPath) {
|
|
|
667
800
|
const targetProject = projectPath || getProjectPath();
|
|
668
801
|
const monitor = healthMonitorsByProject.get(targetProject);
|
|
669
802
|
if (monitor) {
|
|
670
|
-
monitor.
|
|
803
|
+
monitor.destroy();
|
|
671
804
|
healthMonitorsByProject.delete(targetProject);
|
|
672
805
|
logger.debug({ projectPath: targetProject }, '[HealthMonitor] Reset instance for project');
|
|
673
806
|
}
|
|
@@ -677,8 +810,8 @@ export function resetHealthMonitor(projectPath) {
|
|
|
677
810
|
*/
|
|
678
811
|
export function resetAllHealthMonitors() {
|
|
679
812
|
for (const [projectPath, monitor] of healthMonitorsByProject) {
|
|
680
|
-
monitor.
|
|
681
|
-
logger.debug({ projectPath }, '[HealthMonitor]
|
|
813
|
+
monitor.destroy();
|
|
814
|
+
logger.debug({ projectPath }, '[HealthMonitor] Destroyed instance for project');
|
|
682
815
|
}
|
|
683
816
|
healthMonitorsByProject.clear();
|
|
684
817
|
}
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
*/
|
|
13
13
|
import { getEmbeddingServerManager } from '../embeddingServerManager.js';
|
|
14
14
|
import { logger } from '../../utils/logger.js';
|
|
15
|
+
import { execSync } from 'child_process';
|
|
15
16
|
// Module-level reference to embedding provider for socket reset
|
|
16
17
|
let embeddingProviderRef = null;
|
|
17
18
|
/**
|
|
@@ -56,6 +57,36 @@ export class EmbeddingStart {
|
|
|
56
57
|
status: manager.getExtendedStatus()
|
|
57
58
|
};
|
|
58
59
|
}
|
|
60
|
+
// ═══════════════════════════════════════════════════════════════
|
|
61
|
+
// HARD KILL: Force kill ALL frankenstein processes before restart
|
|
62
|
+
// This ensures force start actually works even when manager lost
|
|
63
|
+
// track of the process (e.g., after MCP reconnect)
|
|
64
|
+
// ═══════════════════════════════════════════════════════════════
|
|
65
|
+
try {
|
|
66
|
+
const pids = execSync('pgrep -f "frankenstein-embeddings.py" 2>/dev/null || true', { encoding: 'utf8' }).trim().split('\n').filter(Boolean);
|
|
67
|
+
if (pids.length > 0) {
|
|
68
|
+
logger.info({ pids, count: pids.length }, '[EmbeddingStart] Hard killing all frankenstein processes');
|
|
69
|
+
for (const pidStr of pids) {
|
|
70
|
+
const pid = parseInt(pidStr, 10);
|
|
71
|
+
if (pid && pid !== process.pid) {
|
|
72
|
+
try { process.kill(pid, 'SIGTERM'); } catch { /* already dead */ }
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
// Wait for them to die
|
|
76
|
+
const killWaitMs = parseInt(process.env['SPECMEM_ORPHAN_KILL_WAIT_MS'] || '2000', 10);
|
|
77
|
+
await new Promise(r => setTimeout(r, killWaitMs));
|
|
78
|
+
// Force kill survivors
|
|
79
|
+
for (const pidStr of pids) {
|
|
80
|
+
const pid = parseInt(pidStr, 10);
|
|
81
|
+
if (pid && pid !== process.pid) {
|
|
82
|
+
try { process.kill(pid, 'SIGKILL'); } catch { /* already dead */ }
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
logger.info('[EmbeddingStart] All old frankenstein processes killed');
|
|
86
|
+
}
|
|
87
|
+
} catch (killErr) {
|
|
88
|
+
logger.debug({ error: killErr }, '[EmbeddingStart] Hard kill failed (non-fatal)');
|
|
89
|
+
}
|
|
59
90
|
const result = await manager.userStart();
|
|
60
91
|
// CRITICAL: Reset the MCP's socket connection to pick up new server
|
|
61
92
|
if (result.success && embeddingProviderRef?.resetSocket) {
|
|
@@ -29,6 +29,7 @@ import { logger } from '../../utils/logger.js';
|
|
|
29
29
|
import { v4 as uuidv4 } from 'uuid';
|
|
30
30
|
import { createHash } from 'crypto';
|
|
31
31
|
import fs from 'fs';
|
|
32
|
+
import path from 'path';
|
|
32
33
|
import { getProjectPathForInsert } from '../../services/ProjectContext.js';
|
|
33
34
|
import { getFileCommsTransport } from '../../comms/fileCommsTransport.js';
|
|
34
35
|
import { smartCompress } from '../../utils/tokenCompressor.js';
|
|
@@ -649,6 +650,13 @@ Examples:
|
|
|
649
650
|
logger.warn({ error: fileErr }, '[TeamComms] File fallback failed - message only visible in current process');
|
|
650
651
|
}
|
|
651
652
|
}
|
|
653
|
+
// Write latest team message to statusbar state file for live display
|
|
654
|
+
try {
|
|
655
|
+
const statusFile = path.join(projectPath, 'specmem', 'sockets', 'team-comms-latest.json');
|
|
656
|
+
const commsState = { sender: senderDisplayName, message: message.slice(0, 80), timestamp, channel: channel || 'main' };
|
|
657
|
+
fs.writeFileSync(statusFile, JSON.stringify(commsState));
|
|
658
|
+
}
|
|
659
|
+
catch (_e) { /* non-fatal */ }
|
|
652
660
|
logToTeamChannel('send_message', {
|
|
653
661
|
messageId,
|
|
654
662
|
type,
|
|
@@ -1130,6 +1138,14 @@ Use cross_project: true for system-wide announcements (use sparingly).`;
|
|
|
1130
1138
|
};
|
|
1131
1139
|
teamMessagesMemory.set(messageId, teamMessage);
|
|
1132
1140
|
}
|
|
1141
|
+
// Write latest broadcast to statusbar state file for live display
|
|
1142
|
+
try {
|
|
1143
|
+
const broadcastProjectPath = cross_project ? (process.env.SPECMEM_PROJECT_PATH || process.cwd()) : projectPath;
|
|
1144
|
+
const statusFile = path.join(broadcastProjectPath, 'specmem', 'sockets', 'team-comms-latest.json');
|
|
1145
|
+
const commsState = { sender: senderName, message: message.slice(0, 80), timestamp, channel: 'broadcast' };
|
|
1146
|
+
fs.writeFileSync(statusFile, JSON.stringify(commsState));
|
|
1147
|
+
}
|
|
1148
|
+
catch (_e) { /* non-fatal */ }
|
|
1133
1149
|
logToTeamChannel('broadcast', {
|
|
1134
1150
|
messageId,
|
|
1135
1151
|
broadcast_type,
|