@evomap/evolver 1.87.4 → 1.88.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/index.js +848 -33
  2. package/package.json +1 -1
  3. package/scripts/build_binaries.js +11 -1
  4. package/src/adapters/hookAdapter.js +3 -1
  5. package/src/adapters/scripts/_runtimePaths.js +24 -0
  6. package/src/adapters/scripts/evolver-session-end.js +110 -78
  7. package/src/adapters/scripts/evolver-session-start.js +100 -0
  8. package/src/config.js +43 -8
  9. package/src/evolve/guards.js +1 -1
  10. package/src/evolve/pipeline/collect.js +1 -1
  11. package/src/evolve/pipeline/dispatch.js +1 -1
  12. package/src/evolve/pipeline/enrich.js +1 -1
  13. package/src/evolve/pipeline/hub.js +1 -1
  14. package/src/evolve/pipeline/select.js +1 -1
  15. package/src/evolve/pipeline/signals.js +1 -1
  16. package/src/evolve/utils.js +1 -1
  17. package/src/evolve.js +1 -1
  18. package/src/forceUpdate.js +42 -21
  19. package/src/gep/a2aProtocol.js +1 -1
  20. package/src/gep/assetStore.js +40 -0
  21. package/src/gep/candidateEval.js +1 -1
  22. package/src/gep/candidates.js +1 -1
  23. package/src/gep/contentHash.js +1 -1
  24. package/src/gep/crypto.js +1 -1
  25. package/src/gep/curriculum.js +1 -1
  26. package/src/gep/deviceId.js +1 -1
  27. package/src/gep/envFingerprint.js +1 -1
  28. package/src/gep/epigenetics.js +1 -1
  29. package/src/gep/explore.js +1 -1
  30. package/src/gep/featureFlags.js +4 -0
  31. package/src/gep/gitOps.js +7 -2
  32. package/src/gep/hash.js +1 -1
  33. package/src/gep/hubFetch.js +1 -1
  34. package/src/gep/hubReview.js +1 -1
  35. package/src/gep/hubSearch.js +1 -1
  36. package/src/gep/hubVerify.js +1 -1
  37. package/src/gep/idleScheduler.js +78 -0
  38. package/src/gep/learningSignals.js +1 -1
  39. package/src/gep/mailboxTransport.js +34 -0
  40. package/src/gep/memoryGraph.js +1 -1
  41. package/src/gep/memoryGraphAdapter.js +1 -1
  42. package/src/gep/mutation.js +1 -1
  43. package/src/gep/narrativeMemory.js +1 -1
  44. package/src/gep/openPRRegistry.js +1 -1
  45. package/src/gep/paths.js +16 -2
  46. package/src/gep/personality.js +1 -1
  47. package/src/gep/policyCheck.js +1 -1
  48. package/src/gep/prompt.js +1 -1
  49. package/src/gep/recallVerifier.js +1 -1
  50. package/src/gep/reflection.js +1 -1
  51. package/src/gep/selector.js +1 -1
  52. package/src/gep/skillDistiller.js +1 -1
  53. package/src/gep/solidify.js +1 -1
  54. package/src/gep/strategy.js +1 -1
  55. package/src/gep/validator/index.js +46 -1
  56. package/src/gep/validator/sandboxExecutor.js +10 -1
  57. package/src/gep/validator/stakeBootstrap.js +3 -0
  58. package/src/gep/workspaceKeychain.js +1 -1
  59. package/src/ops/lifecycle.js +79 -10
  60. package/src/ops/skills_monitor.js +2 -1
  61. package/src/proxy/index.js +7 -1
  62. package/src/proxy/lifecycle/manager.js +77 -4
  63. package/src/proxy/mailbox/store.js +52 -2
  64. package/src/proxy/server/settings.js +16 -2
  65. package/src/proxy/sync/inbound.js +14 -1
@@ -23,12 +23,30 @@ const HEARTBEAT_BACKOFF_CAP_MS = 15 * 60_000;
23
23
  const HELLO_TIMEOUT = 15_000;
24
24
  const HEARTBEAT_TIMEOUT = 10_000;
25
25
  const MAX_REAUTH_ATTEMPTS = 2;
26
- // First failure = 30 min, subsequent consecutive failures double up to ~4h.
27
- // Without escalation a daemon stuck on a bad secret gets re-poked every 30
28
- // minutes by inbound auth errors and fills the log forever.
29
- const REAUTH_BACKOFF_BASE_MS = 30 * 60_000;
26
+ // First failure = 2 min, subsequent consecutive failures double up to ~4h.
27
+ // Aligned with a2aProtocol.js Round-9 reduction (was 30 min, caused
28
+ // "idle-death" for proxy-mode users: one benign 401 silenced the node for 30
29
+ // min, triggering stagnation kills and manual restart loops).
30
+ const REAUTH_BACKOFF_BASE_MS = 2 * 60_000;
30
31
  const REAUTH_BACKOFF_MAX_MS = 4 * 60 * 60_000;
31
32
 
33
+ // Wall-clock drift detector tunables. Mirrors DRIFT_CHECK_MS /
34
+ // DRIFT_SLEEP_THRESHOLD_MS / DRIFT_LONG_SLEEP_THRESHOLD_MS in
35
+ // src/gep/a2aProtocol.js. setTimeout / setInterval fire on libuv's
36
+ // monotonic clock, which freezes while the host is suspended -- so a
37
+ // laptop closed for hours and reopened would not trigger any heartbeat
38
+ // tick until the next scheduled time, which under exponential backoff
39
+ // can sit at HEARTBEAT_BACKOFF_CAP_MS (15 min). Sampling Date.now()
40
+ // (wall clock) every DRIFT_CHECK_MS lets us detect the jump and
41
+ // immediately poke the heartbeat so recovery does not have to wait for
42
+ // the next natural tick. Long-sleep gap also clears reauth backoff:
43
+ // hub-side state we cached is almost certainly stale after a 30min+
44
+ // suspend, so force a clean retry path on wake instead of carrying the
45
+ // pre-sleep penalty through. R10 (#544).
46
+ const DRIFT_CHECK_MS = 30 * 1000;
47
+ const DRIFT_SLEEP_THRESHOLD_MS = 90 * 1000;
48
+ const DRIFT_LONG_SLEEP_THRESHOLD_MS = 30 * 60_000;
49
+
32
50
  let _cachedFingerprint = null;
33
51
  function _getEnvFingerprint() {
34
52
  if (_cachedFingerprint) return _cachedFingerprint;
@@ -101,6 +119,8 @@ class LifecycleManager {
101
119
  this._helloRateLimitUntil = 0;
102
120
  this._reauthBackoffUntil = 0;
103
121
  this._consecutiveReauthFailures = 0;
122
+ this._driftInterval = null;
123
+ this._lastDriftCheckAt = 0;
104
124
  }
105
125
 
106
126
  get nodeId() {
@@ -510,6 +530,55 @@ class LifecycleManager {
510
530
  // to schedule its own next timer (a fresher path already owns it).
511
531
  this._heartbeatGen = (this._heartbeatGen || 0) + 1;
512
532
  this._heartbeatTick(this._heartbeatGen);
533
+ this._startDriftDetector();
534
+ }
535
+
536
+ // Sample wall-clock every DRIFT_CHECK_MS so macOS sleep / hypervisor
537
+ // pause / debugger break is detected and the heartbeat loop is poked
538
+ // back into action without waiting for the (possibly 15-min) backoff
539
+ // timer to fire on libuv's monotonic clock. R10 (#544).
540
+ _startDriftDetector() {
541
+ if (this._driftInterval) return;
542
+ this._lastDriftCheckAt = Date.now();
543
+ this._driftInterval = setInterval(() => {
544
+ // Wrap the whole body in try/catch -- this is a setInterval
545
+ // callback; a throw escaping it becomes an uncaught exception, which by
546
+ // default takes down the process (detector included) -- the bug we're protecting against.
547
+ try {
548
+ if (!this._running) return;
549
+ const now = Date.now();
550
+ const gap = now - this._lastDriftCheckAt;
551
+ this._lastDriftCheckAt = now;
552
+ if (gap > DRIFT_SLEEP_THRESHOLD_MS) {
553
+ try {
554
+ this.logger.warn(
555
+ `[lifecycle] wall-clock jump detected (+${Math.round(gap / 1000)}s); ` +
556
+ 'likely sleep/wake or process suspension, poking heartbeat'
557
+ );
558
+ } catch (_) { /* logger broken; detector must still poke */ }
559
+ // Long-sleep recovery: the hub-side cached state we carried
560
+ // through the suspend is almost certainly stale. Clear reauth
561
+ // backoff so the next tick can try a clean recovery path
562
+ // instead of sitting out a pre-sleep penalty for up to 4h.
563
+ if (gap > DRIFT_LONG_SLEEP_THRESHOLD_MS) {
564
+ this._consecutiveReauthFailures = 0;
565
+ this._reauthBackoffUntil = 0;
566
+ try {
567
+ this.logger.warn(
568
+ `[lifecycle] long sleep (+${Math.round(gap / 60_000)}min) cleared reauth backoff`
569
+ );
570
+ } catch (_) { /* logger broken; non-fatal */ }
571
+ }
572
+ this.pokeHeartbeatLoop();
573
+ }
574
+ } catch (err) {
575
+ try { this.logger.error(`[lifecycle] drift detector threw: ${err && err.message}`); }
576
+ catch (_) { /* never let the detector escape */ }
577
+ }
578
+ }, DRIFT_CHECK_MS);
579
+ // Don't keep the event loop alive on behalf of the detector alone --
580
+ // matches the unref() used on _heartbeatTimer.
581
+ if (this._driftInterval.unref) this._driftInterval.unref();
513
582
  }
514
583
 
515
584
  async _heartbeatTick(myGen) {
@@ -574,6 +643,10 @@ class LifecycleManager {
574
643
  clearTimeout(this._heartbeatTimer);
575
644
  this._heartbeatTimer = null;
576
645
  }
646
+ if (this._driftInterval) {
647
+ clearInterval(this._driftInterval);
648
+ this._driftInterval = null;
649
+ }
577
650
  }
578
651
 
579
652
  _shouldUpgrade(minVersion) {
@@ -68,6 +68,27 @@ function safeParse(payload) {
68
68
  try { return JSON.parse(payload); } catch { return payload; }
69
69
  }
70
70
 
71
+ // Round-9: the round-8 cross-process append lock (§21.7) was REMOVED.
72
+ // Its premise -- that fs.appendFileSync to a regular file can interleave
73
+ // bytes mid-line unless each write stays under PIPE_BUF (512 B darwin,
74
+ // 4096 B linux) -- conflated two different POSIX guarantees. PIPE_BUF
75
+ // atomicity is defined for PIPES/FIFOs, not regular files. A single
76
+ // O_APPEND write() to a regular file is positioned atomically at EOF and
77
+ // is not interleaved with other appenders on the local filesystems evolver
78
+ // uses (~/.evomap); this was verified empirically on darwin/APFS --
79
+ // concurrent 4 KB..1 MB appends from 6 writers produced zero torn lines.
80
+ // So the lock guarded a non-problem. Worse, its 5 s deadline with a
81
+ // busy-wait (Atomics.wait, then a spin-loop fallback) ran on the single
82
+ // JS thread, so under any real contention it BLOCKED the event loop --
83
+ // starving the very heartbeat/SSE/HTTP it shared the process with, i.e.
84
+ // it could itself produce the "process alive but inert" symptom it claimed
85
+ // to prevent. fs.appendFileSync writes the whole buffer with O_APPEND, so
86
+ // a single record lands as one atomic append.
87
+ //
88
+ // Windows note: PIPE_BUF is a POSIX concept; it does not exist on Windows.
89
+ // Windows NTFS provides the same atomicity guarantee for O_APPEND writes to
90
+ // regular files that POSIX local filesystems do, so the removal above is
91
+ // equally valid on Windows. No platform-specific code is needed here.
71
92
  function appendLine(filePath, obj) {
72
93
  fs.appendFileSync(filePath, JSON.stringify(obj) + '\n', 'utf8');
73
94
  }
@@ -135,8 +156,27 @@ class MailboxStore {
135
156
  _persistState() {
136
157
  const dir = path.dirname(this._stateFile);
137
158
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
138
- const tmp = `${this._stateFile}.tmp`;
159
+ // Round-7 (§20.5): per-PID tmp path. Two evolver processes (daemon +
160
+ // ad-hoc CLI / proxy + loop) writing to the same `${stateFile}.tmp`
161
+ // would otherwise interleave: process B's writeFileSync truncates
162
+ // A's tmp mid-write, then B's rename completes with B's truncated
163
+ // payload as the final state.json. `state.json` holds the cached
164
+ // node_secret after a hub rotation -- a torn write here is the
165
+ // load-bearing trigger for the "401-loop -> reauth backoff -> dead
166
+ // for 30 min..4 h" symptom this branch targets. Matches the
167
+ // precedent set by _persistNodeSecret in src/gep/a2aProtocol.js.
168
+ const tmp = `${this._stateFile}.${process.pid}.tmp`;
139
169
  fs.writeFileSync(tmp, JSON.stringify(this._state, null, 2) + '\n', 'utf8');
170
+ // Windows: fs.renameSync can throw EPERM when the destination file exists
171
+ // and is locked or held open, unlike POSIX where rename(2) atomically replaces
172
+ // the target. The destination is removed first to avoid that failure.
173
+ // The window between unlink and rename is small but not atomic: a crash in
174
+ // that window leaves no state file and an orphaned per-PID tmp file, which nothing in _persistState reclaims.
175
+ if (process.platform === 'win32') {
176
+ try { fs.unlinkSync(this._stateFile); } catch (e) {
177
+ if (e.code !== 'ENOENT') throw e;
178
+ }
179
+ }
140
180
  fs.renameSync(tmp, this._stateFile);
141
181
  }
142
182
 
@@ -384,7 +424,10 @@ class MailboxStore {
384
424
  // --- Compaction (reduces JSONL file size by rewriting only current state) ---
385
425
 
386
426
  compact() {
387
- const tmpFile = this._messagesFile + '.tmp';
427
+ // Round-7 (§20.5): same per-PID tmp rationale as _persistState.
428
+ // Two concurrent compact() calls (daemon + ad-hoc CLI) racing on
429
+ // the same `${messagesFile}.tmp` lose the loser's compacted log.
430
+ const tmpFile = `${this._messagesFile}.${process.pid}.tmp`;
388
431
  const entries = [];
389
432
  for (const [, msg] of this._messages) {
390
433
  entries.push(msg);
@@ -396,6 +439,13 @@ class MailboxStore {
396
439
  fs.writeSync(fd, JSON.stringify(msg) + '\n');
397
440
  }
398
441
  fs.closeSync(fd);
442
+ // Windows: renameSync can throw EPERM when the destination exists and is locked or held open.
443
+ // Remove it first so the swap succeeds on all platforms.
444
+ if (process.platform === 'win32') {
445
+ try { fs.unlinkSync(this._messagesFile); } catch (e) {
446
+ if (e.code !== 'ENOENT') throw e;
447
+ }
448
+ }
399
449
  fs.renameSync(tmpFile, this._messagesFile);
400
450
  this._rebuildIndex();
401
451
  }
@@ -37,9 +37,13 @@ function writeSettings(data) {
37
37
  }
38
38
  const current = readSettings();
39
39
  const merged = { ...current, ...data };
40
+ // NOTE(windows): mode 0o600 is silently ignored on Windows. The settings
41
+ // file (which may contain proxy credentials) will NOT be owner-read-only.
42
+ // Only Windows user-profile directory ACLs provide isolation. The chmodSync
43
+ // call below is also a no-op on Windows but is retained for Unix correctness.
40
44
  fs.writeFileSync(file, JSON.stringify(merged, null, 2), { encoding: 'utf8', mode: 0o600 });
41
45
  // mode: 0o600 only applies on creation; explicitly chmod to tighten pre-existing files
42
- try { fs.chmodSync(file, 0o600); } catch {}
46
+ try { fs.chmodSync(file, 0o600); } catch { /* best-effort; no-op on Windows */ }
43
47
  return merged;
44
48
  }
45
49
 
@@ -59,9 +63,19 @@ function isStaleProxy() {
59
63
  const pid = settings.proxy?.pid;
60
64
  if (!pid) return false;
61
65
  try {
66
+ // process.kill(pid, 0) probes whether the process exists without sending a
67
+ // signal. On POSIX it throws ESRCH when the PID is gone. On Windows the
68
+ // Node.js runtime maps this to the same behavior (ESRCH via uv_kill), so
69
+ // the cross-platform semantics are consistent. If the current process does
70
+ // not have permission to query the target PID, EPERM is thrown -- that
71
+ // means the PID exists and is owned by another user, so we treat it as
72
+ // live (not stale) rather than crashing.
62
73
  process.kill(pid, 0);
63
74
  return false;
64
- } catch {
75
+ } catch (err) {
76
+ // ESRCH: process does not exist -> stale.
77
+ // EPERM: process exists but is not ours -> not stale (leave settings alone).
78
+ if (err.code === 'EPERM') return false;
65
79
  return true;
66
80
  }
67
81
  }
@@ -83,12 +83,25 @@ class InboundSync {
83
83
 
84
84
  try {
85
85
  const senderId = this.store.getState('node_id');
86
- await hubFetch(endpoint, {
86
+ // Round-8 (§21.5): drain the response body so the undici long-poll
87
+ // dispatcher pool is not leaked one socket per ack. ackDelivered
88
+ // is called every inbound poll cycle (default 1-10s); the
89
+ // pre-round-8 code captured no reference to res and never called
90
+ // .json()/.text()/body.cancel(), so each ack pinned a socket
91
+ // until GC. After a few minutes of activity the strict-pool was
92
+ // exhausted and proxy-mode heartbeats hung on next acquire --
93
+ // matches the "alive once then dead" user symptom in proxy mode.
94
+ const res = await hubFetch(endpoint, {
87
95
  method: 'POST',
88
96
  headers: this.getHeaders(),
89
97
  body: JSON.stringify({ sender_id: senderId, message_ids: delivered.map(m => m.id) }),
90
98
  signal: AbortSignal.timeout(10_000),
91
99
  });
100
+ try {
101
+ if (res && res.body && typeof res.body.cancel === 'function') {
102
+ await res.body.cancel().catch(() => {});
103
+ }
104
+ } catch (_) { /* never escape the drain helper */ }
92
105
  return { acked: delivered.length };
93
106
  } catch (err) {
94
107
  this.logger.error(`[inbound] ack failed: ${err.message}`);