@evomap/evolver 1.87.4 → 1.88.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/index.js +934 -33
  2. package/package.json +1 -1
  3. package/scripts/build_binaries.js +11 -1
  4. package/src/adapters/hookAdapter.js +3 -1
  5. package/src/adapters/scripts/_runtimePaths.js +24 -0
  6. package/src/adapters/scripts/evolver-session-end.js +110 -78
  7. package/src/adapters/scripts/evolver-session-start.js +100 -0
  8. package/src/config.js +43 -8
  9. package/src/evolve/guards.js +1 -1
  10. package/src/evolve/pipeline/collect.js +1 -1
  11. package/src/evolve/pipeline/dispatch.js +1 -1
  12. package/src/evolve/pipeline/enrich.js +1 -1
  13. package/src/evolve/pipeline/hub.js +1 -1
  14. package/src/evolve/pipeline/select.js +1 -1
  15. package/src/evolve/pipeline/signals.js +1 -1
  16. package/src/evolve/utils.js +1 -1
  17. package/src/evolve.js +1 -1
  18. package/src/forceUpdate.js +42 -21
  19. package/src/gep/a2aProtocol.js +1 -1
  20. package/src/gep/assetStore.js +40 -0
  21. package/src/gep/autoDistillConv.js +1 -0
  22. package/src/gep/autoDistillLlm.js +1 -0
  23. package/src/gep/bridge.js +69 -2
  24. package/src/gep/candidateEval.js +1 -1
  25. package/src/gep/candidates.js +1 -1
  26. package/src/gep/contentHash.js +1 -1
  27. package/src/gep/conversationSniffer.js +1 -0
  28. package/src/gep/crypto.js +1 -1
  29. package/src/gep/curriculum.js +1 -1
  30. package/src/gep/deviceId.js +1 -1
  31. package/src/gep/envFingerprint.js +1 -1
  32. package/src/gep/epigenetics.js +1 -1
  33. package/src/gep/execBridge.js +1 -0
  34. package/src/gep/explore.js +1 -1
  35. package/src/gep/featureFlags.js +4 -0
  36. package/src/gep/gitOps.js +7 -2
  37. package/src/gep/hash.js +1 -1
  38. package/src/gep/hubFetch.js +1 -1
  39. package/src/gep/hubReview.js +1 -1
  40. package/src/gep/hubSearch.js +1 -1
  41. package/src/gep/hubVerify.js +1 -1
  42. package/src/gep/idleScheduler.js +78 -0
  43. package/src/gep/learningSignals.js +1 -1
  44. package/src/gep/mailboxTransport.js +34 -0
  45. package/src/gep/memoryGraph.js +1 -1
  46. package/src/gep/memoryGraphAdapter.js +1 -1
  47. package/src/gep/mutation.js +1 -1
  48. package/src/gep/narrativeMemory.js +1 -1
  49. package/src/gep/openPRRegistry.js +1 -1
  50. package/src/gep/paths.js +16 -2
  51. package/src/gep/personality.js +1 -1
  52. package/src/gep/policyCheck.js +1 -1
  53. package/src/gep/prompt.js +1 -1
  54. package/src/gep/recallVerifier.js +1 -1
  55. package/src/gep/reflection.js +1 -1
  56. package/src/gep/selector.js +1 -1
  57. package/src/gep/skillDistiller.js +1 -1
  58. package/src/gep/solidify.js +1 -1
  59. package/src/gep/strategy.js +1 -1
  60. package/src/gep/validator/index.js +46 -1
  61. package/src/gep/validator/sandboxExecutor.js +10 -1
  62. package/src/gep/validator/stakeBootstrap.js +3 -0
  63. package/src/gep/workspaceKeychain.js +1 -1
  64. package/src/ops/lifecycle.js +79 -10
  65. package/src/ops/skills_monitor.js +2 -1
  66. package/src/proxy/index.js +31 -6
  67. package/src/proxy/lifecycle/manager.js +77 -4
  68. package/src/proxy/mailbox/store.js +52 -2
  69. package/src/proxy/server/settings.js +16 -2
  70. package/src/proxy/sync/inbound.js +14 -1
package/index.js CHANGED
@@ -42,14 +42,94 @@ try {
42
42
  const evolve = require('./src/evolve');
43
43
  const { solidify } = require('./src/gep/solidify');
44
44
  const path = require('path');
45
+ const os = require('os');
45
46
  const { getRepoRoot } = require('./src/gep/paths');
46
47
  const fs = require('fs');
47
48
  const { spawn } = require('child_process');
48
49
 
50
+ // Interruptible sleep: SIGCONT (and any future wake hook) can short-circuit
51
+ // pending sleeps so a daemon that just woke from macOS sleep doesn't sit
52
+ // out the rest of its pre-sleep adaptive-sleep window on the resumed
53
+ // monotonic clock. Without this, the heartbeat side recovers via the
54
+ // drift detector but the outer evolve cycle stays paused up to maxSleepMs
55
+ // (default 5 min) after wake. Each call tracks its own resolver in
56
+ // _activeSleeps so the wake hook can resolve all of them.
57
+ const _activeSleeps = new Set();
49
58
  function sleepMs(ms) {
50
59
  const n = parseInt(String(ms), 10);
51
60
  const t = Number.isFinite(n) ? Math.max(0, n) : 0;
52
- return new Promise(resolve => setTimeout(resolve, t));
61
+ return new Promise(resolve => {
62
+ let done = false;
63
+ const finish = () => {
64
+ if (done) return;
65
+ done = true;
66
+ clearTimeout(timer);
67
+ _activeSleeps.delete(finish);
68
+ resolve();
69
+ };
70
+ const timer = setTimeout(finish, t);
71
+ // NOTE: intentionally NOT calling timer.unref() here. When the daemon is in
72
+ // a long adaptive sleep (up to maxSleepMs = 5 min by default), this timer is
73
+ // often the ONLY ref'd handle keeping the event loop alive. All other timers
74
+ // (_heartbeatTimer, _heartbeatDriftInterval, _selfDrivingPollTimer, etc.) are
75
+ // unref'd, so once the evolve loop's sleepMs timer was also unref'd, Node.js
76
+ // could see zero ref'd handles and silently exit the process mid-sleep. That
77
+ // was the root cause of "first launch ok, idle for a while, then evolver dead
78
+ // with no log trace" on macOS. A ref'd sleep timer is the load-bearing event-
79
+ // loop anchor during idle periods; it fires within maxSleepMs and the daemon
80
+ // then reschedules itself normally. Leaving it ref'd has no observable cost.
81
+ _activeSleeps.add(finish);
82
+ });
83
+ }
84
+ function _interruptAllSleeps() {
85
+ if (_activeSleeps.size === 0) return;
86
+ // Snapshot first because resolvers mutate the set as they run.
87
+ const finishers = Array.from(_activeSleeps);
88
+ for (const fn of finishers) {
89
+ try { fn(); } catch (_) {}
90
+ }
91
+ }
92
+
93
+ // Round-6 (§19.5): heartbeat-internal wake recovery (drainPool +
94
+ // pokeHeartbeat + SSE restart + self-driving-poll re-arm) lives in
95
+ // a2aProtocol so the drift detector can drive it directly. Process-
96
+ // level wake hooks (sleepMs interrupter, validator daemon poke) are
97
+ // registered with a2aProtocol so both the SIGCONT handler and the
98
+ // drift detector long-sleep branch run them. Lazy-register so requires
99
+ // resolve cleanly under test (single Set of registered hooks; cheap to
100
+ // re-register idempotently).
101
+ let _wakeHooksRegistered = false;
102
+ function _registerProcessWakeHooks() {
103
+ if (_wakeHooksRegistered) return;
104
+ try {
105
+ const a2a = require('./src/gep/a2aProtocol.js');
106
+ if (typeof a2a.registerWakeHook !== 'function') return;
107
+ a2a.registerWakeHook(function () {
108
+ try { _interruptAllSleeps(); } catch (_) {}
109
+ });
110
+ // R13: guards.sleepMs is a separate private helper used for 60-120s
111
+ // backoffs inside evolve.run() arms (active-sessions, system-load,
112
+ // pending-solidify). Without this hook, a guard sleep that spans
113
+ // macOS suspend would block the cycle for the full window on the
114
+ // resumed monotonic clock even though the outer sleep was interrupted.
115
+ a2a.registerWakeHook(function () {
116
+ try {
117
+ const guards = require('./src/evolve/guards');
118
+ if (guards && typeof guards._interruptGuardSleeps === 'function') {
119
+ guards._interruptGuardSleeps();
120
+ }
121
+ } catch (_) {}
122
+ });
123
+ a2a.registerWakeHook(function () {
124
+ try {
125
+ const v = require('./src/gep/validator');
126
+ if (v && typeof v.pokeValidatorDaemon === 'function') {
127
+ v.pokeValidatorDaemon();
128
+ }
129
+ } catch (_) {}
130
+ });
131
+ _wakeHooksRegistered = true;
132
+ } catch (_) {}
53
133
  }
54
134
 
55
135
  function readJsonSafe(p) {
@@ -186,35 +266,256 @@ function getLastSignals(statePath) {
186
266
  }
187
267
  }
188
268
 
189
- // Singleton Guard - prevent multiple evolver daemon instances
269
+ // Singleton Guard - prevent multiple evolver daemon instances.
270
+ //
271
+ // Round-4: pidfile location previously defaulted to __dirname, which is a
272
+ // DIFFERENT path per install mode -- /usr/local/lib/node_modules/... for a
273
+ // global install, the dev-clone path for `node index.js`, a transient
274
+ // $NPM_CACHE/_npx/<hash> for `npx evolver`. Two daemons launched under
275
+ // different install modes never saw each other's lock and could run
276
+ // concurrently against the same ~/.evomap/node_secret, ping-ponging on
277
+ // secret rotation and silently entering reauth backoff -- the user-
278
+ // reported "first launch ok, idle, then dead forever" pattern. Default
279
+ // now lives under the per-user state dir so all install modes converge.
280
+ // EVOLVER_LOCK_DIR still overrides for tests / sandboxed runs.
190
281
  function getLockFilePath() {
191
- // Allow tests / sandboxed runs to override the pid-file location so they
192
- // do not collide with a real daemon's lock at the source-dir default.
193
- const dir = process.env.EVOLVER_LOCK_DIR || __dirname;
194
- return path.join(dir, 'evolver.pid');
282
+ if (process.env.EVOLVER_LOCK_DIR) {
283
+ return path.join(process.env.EVOLVER_LOCK_DIR, 'evolver.pid');
284
+ }
285
+ // os.homedir() is cross-platform; process.env.HOME is unset on Windows.
286
+ return path.join(os.homedir(), '.evomap', 'instance.lock');
195
287
  }
288
+
289
+ function _writeLockAtomic(lockFile, payload) {
290
+ // Round-6 (§19.8): the previous implementation used tmp + rename, which
291
+ // makes the WRITE atomic but not the OWNERSHIP claim. Two processes
292
+ // could both rename their own tmp file over the same lockFile (rename
293
+ // is atomic per call but successive renames overwrite each other), then
294
+ // each read it back and -- if the second rename happened between the
295
+ // first process's rename and its read-back -- see the OTHER process's
296
+ // PID. Each then concludes "I lost the race" and exits, leaving the
297
+ // lockFile owned by no live process. Symmetrically, two processes can
298
+ // each see their own PID if the reads happen between their respective
299
+ // renames, and both conclude they won.
300
+ //
301
+ // The proper primitive is link(2): given a unique tmp file, link to the
302
+ // target path fails atomically with EEXIST if the target already
303
+ // exists. Only one of N concurrent linkers succeeds.
304
+ // NOTE(windows): mode 0o700 / 0o600 are silently ignored on Windows.
305
+ // The lock directory and tmp file will NOT be owner-only on Windows.
306
+ // Isolation relies solely on the user-profile directory ACLs.
307
+ const dir = path.dirname(lockFile);
308
+ try { fs.mkdirSync(dir, { recursive: true, mode: 0o700 }); } catch (_) {}
309
+ const tmp = lockFile + '.' + process.pid + '.tmp';
310
+ fs.writeFileSync(tmp, payload, { encoding: 'utf8', mode: 0o600 });
311
+ // link() requires the target NOT to exist. The caller in the takeover
312
+ // path has already unlinked the stale lockFile via fs.unlinkSync
313
+ // (ignoring ENOENT). If a concurrent process beat us to the link, our
314
+ // linkSync below throws EEXIST -- we surface that to the caller and
315
+ // clean up our tmp.
316
+ //
317
+ // EXDEV: fs.link() fails with EXDEV when tmp and lockFile are on different
318
+ // volumes (can happen on Windows when EVOLVER_LOCK_DIR points to a drive
319
+ // other than the tmp dir). Fall back to renameSync, which Node.js handles
320
+ // cross-device by copying + deleting. rename is not atomic in this path,
321
+ // so the EEXIST guard is lost, but this is an unusual configuration and
322
+ // the result is still safe (worst case: two daemons both think they won,
323
+ // the second write wins, the first will exit on its next tick when it
324
+ // reads back a foreign PID via the heartbeat).
325
+ try {
326
+ fs.linkSync(tmp, lockFile);
327
+ } catch (err) {
328
+ if (err && err.code === 'EXDEV') {
329
+ // Cross-device: rename falls back to copy+delete inside Node.js; this
330
+ // loses the atomic-EEXIST guarantee but is better than hard-failing.
331
+ try {
332
+ fs.renameSync(tmp, lockFile);
333
+ } catch (renameErr) {
334
+ try { fs.unlinkSync(tmp); } catch (_) {}
335
+ throw renameErr;
336
+ }
337
+ return; // tmp has been consumed by renameSync, skip unlinkSync below
338
+ }
339
+ try { fs.unlinkSync(tmp); } catch (_) {}
340
+ throw err;
341
+ }
342
+ try { fs.unlinkSync(tmp); } catch (_) {}
343
+ }
344
+
345
+ function _readLockPayload(lockFile) {
346
+ try {
347
+ const raw = fs.readFileSync(lockFile, 'utf8').trim();
348
+ if (!raw) return null;
349
+ // Backward-compat: older lock files contained only the pid as text.
350
+ // Newer payloads are JSON {pid, uid, startedAt}.
351
+ if (raw[0] === '{') {
352
+ try { return JSON.parse(raw); } catch (_) { return null; }
353
+ }
354
+ const pid = parseInt(raw, 10);
355
+ return Number.isFinite(pid) && pid > 0 ? { pid: pid } : null;
356
+ } catch (_) { return null; }
357
+ }
358
+
359
+ function _lockPayload() {
360
+ return JSON.stringify({
361
+ pid: process.pid,
362
+ uid: typeof process.getuid === 'function' ? process.getuid() : null,
363
+ startedAt: new Date().toISOString(),
364
+ // Round-9: marks a daemon that refreshes this lock file's mtime on a
365
+ // lease (see startLockRefresh). Only when this flag is present do
366
+ // acquireLock / refuseHelloIfDaemonRunning trust mtime-staleness to
367
+ // reclaim a lock whose PID is alive -- the PID-reuse / SIGKILL-stale
368
+ // guard. A lock written by an OLDER daemon (no flag) keeps the legacy
369
+ // kill(0)-only behavior so a new binary can never falsely steal a
370
+ // still-running old daemon's lock (which would run two daemons).
371
+ lease: true,
372
+ });
373
+ }
374
+
375
+ // Round-9: lease tunables for the daemon lock. A live daemon refreshes the
376
+ // lock mtime every LOCK_REFRESH_MS; a lock whose mtime is older than
377
+ // STALE_LOCK_TTL_MS (and that was written by a lease-aware daemon) is
378
+ // treated as stale even if its PID happens to be alive -- closing the
379
+ // "crash + PID reuse -> new daemon silently refuses to start" hole and the
380
+ // "SIGKILL leaves a stale lock nobody reclaims" hole. The TTL is well above
381
+ // the lock refresh interval (LOCK_REFRESH_MS) so a healthy daemon never trips it.
382
+ // On Windows, SIGTERM is implemented as TerminateProcess() (not a catchable
383
+ // signal), so the shutdown() handler that calls releaseLock() never runs.
384
+ // The lock file stays on disk with the dead PID. Reduce the TTL on Windows
385
+ // so a subsequent start doesn't wait 15 minutes to reclaim the stale lock.
386
+ // Unix dropped from 15 min -> 5 min so a wedged daemon does not block takeover
387
+ // for a quarter hour. 5 min is still 2.5x the 2-min Unix refresh cadence.
388
+ // Windows 3 min TTL gets a 1-min refresh (3x margin) since 2-min refresh left
389
+ // only 1.5x margin against transient FS hiccups.
390
+ const STALE_LOCK_TTL_MS = process.platform === 'win32' ? 3 * 60_000 : 5 * 60_000;
391
+ const LOCK_REFRESH_MS = process.platform === 'win32' ? 1 * 60_000 : 2 * 60_000;
392
+ let _lockRefreshTimer = null;
393
+
394
+ // Returns true if the lock was written by a lease-aware daemon AND its
395
+ // mtime is older than the stale TTL -- i.e. no live owner is refreshing it,
396
+ // so it is safe to reclaim regardless of whether the recorded PID resolves.
397
+ function _lockIsStaleByLease(lockFile, payload) {
398
+ if (!payload || payload.lease !== true) return false;
399
+ try {
400
+ const ageMs = Date.now() - fs.statSync(lockFile).mtimeMs;
401
+ return ageMs > STALE_LOCK_TTL_MS;
402
+ } catch (_) {
403
+ return false;
404
+ }
405
+ }
406
+
407
+ // Start refreshing the lock file's mtime so other processes can tell this
408
+ // daemon is alive without trusting a (recyclable) PID. unref'd: it never
409
+ // keeps the event loop open on its own, but fires for as long as the daemon
410
+ // is otherwise alive.
411
+ function startLockRefresh() {
412
+ if (_lockRefreshTimer) return;
413
+ const lockFile = getLockFilePath();
414
+ _lockRefreshTimer = setInterval(function () {
415
+ try {
416
+ const now = new Date();
417
+ fs.utimesSync(lockFile, now, now);
418
+ } catch (_) { /* lock gone / FS error: nothing we can do here */ }
419
+ }, LOCK_REFRESH_MS);
420
+ if (_lockRefreshTimer && typeof _lockRefreshTimer.unref === 'function') {
421
+ _lockRefreshTimer.unref();
422
+ }
423
+ }
424
+
425
+ function stopLockRefresh() {
426
+ if (_lockRefreshTimer) {
427
+ clearInterval(_lockRefreshTimer);
428
+ _lockRefreshTimer = null;
429
+ }
430
+ }
431
+
196
432
  function acquireLock() {
197
433
  const lockFile = getLockFilePath();
434
+ // NOTE(windows): mode 0o700 / 0o600 are silently ignored on Windows.
435
+ // Lock directory and file permissions provide no OS-level isolation on
436
+ // Windows; rely on user-profile directory ACLs (%USERPROFILE%\.evomap).
198
437
  try {
438
+ try { fs.mkdirSync(path.dirname(lockFile), { recursive: true, mode: 0o700 }); } catch (_) {}
199
439
  try {
200
- fs.writeFileSync(lockFile, String(process.pid), { flag: 'wx' });
440
+ fs.writeFileSync(lockFile, _lockPayload(), { flag: 'wx', mode: 0o600 });
201
441
  return true;
202
442
  } catch (exclErr) {
203
443
  if (exclErr.code !== 'EEXIST') throw exclErr;
204
444
  }
205
- const pid = parseInt(fs.readFileSync(lockFile, 'utf8').trim(), 10);
206
- if (!Number.isFinite(pid) || pid <= 0) {
207
- console.log('[Singleton] Corrupt lock file (invalid PID). Taking over.');
445
+ const payload = _readLockPayload(lockFile);
446
+ if (!payload || !Number.isFinite(payload.pid) || payload.pid <= 0) {
447
+ console.log('[Singleton] Corrupt lock file. Taking over.');
448
+ } else if (_lockIsStaleByLease(lockFile, payload)) {
449
+ // Round-9: a lease-aware daemon has not refreshed this lock's mtime
450
+ // within the stale TTL. Either it was SIGKILLed/crashed, or its PID
451
+ // has since been reused by an unrelated process (kill(0) below would
452
+ // then falsely report it alive and we would refuse to start forever).
453
+ // The expired lease is authoritative: take over.
454
+ console.log('[Singleton] Lock lease expired (PID ' + payload.pid + ', no mtime refresh for > ' +
455
+ Math.round(STALE_LOCK_TTL_MS / 60_000) + 'min). Taking over.');
208
456
  } else {
209
457
  try {
210
- process.kill(pid, 0);
211
- console.log(`[Singleton] Evolver loop already running (PID ${pid}). Exiting.`);
458
+ process.kill(payload.pid, 0);
459
+ // Process exists. Distinguish "alive, our user" (refuse) from
460
+ // "alive, different uid" (also refuse -- never barge into a root
461
+ // daemon under a user-launched evolver, etc.).
462
+ console.log(`[Singleton] Evolver loop already running (PID ${payload.pid}). Exiting.`);
212
463
  return false;
213
464
  } catch (e) {
214
- console.log(`[Singleton] Stale lock found (PID ${pid}). Taking over.`);
465
+ if (e && e.code === 'EPERM') {
466
+ // PID exists but belongs to another user. Conservatively
467
+ // refuse: barging in would race the existing daemon for
468
+ // secret/heartbeat ownership.
469
+ console.warn(`[Singleton] Lock owned by PID ${payload.pid} (different user). Refusing to take over. ` +
470
+ `Remove ${lockFile} manually if the PID is actually dead.`);
471
+ return false;
472
+ }
473
+ console.log(`[Singleton] Stale lock found (PID ${payload.pid}). Taking over.`);
474
+ }
475
+ }
476
+ // Atomic takeover so two daemons that both observe the same stale PID
477
+ // and pass the kill(0) check cannot both end up "owning" the lock.
478
+ //
479
+ // Bug it fixes: the previous "unconditional unlinkSync then linkSync"
480
+ // pattern was NOT atomic across acquirers. Interleaving where P1 wins
481
+ // the linkSync but P2's unlinkSync then deletes P1's freshly-linked
482
+ // file (P2 never re-verifies it's deleting the same stale lock it
483
+ // observed) lets P2's subsequent linkSync also succeed. Both processes
484
+ // then return true and start a daemon, racing each other on the
485
+ // shared singleton secret store.
486
+ //
487
+ // renameSync is atomic at the filesystem level: only one of N racing
488
+ // acquirers can move the stale lockFile to a unique claim name, the
489
+ // rest see ENOENT and abort. After the claim succeeds, _writeLockAtomic
490
+ // installs the fresh lock; the claim file is unlinked in every exit
491
+ // path so it doesn't accumulate.
492
+ const claimFile = lockFile + '.' + process.pid + '.' + Date.now() + '.takeover';
493
+ try {
494
+ fs.renameSync(lockFile, claimFile);
495
+ } catch (e) {
496
+ if (e && e.code === 'ENOENT') {
497
+ // Another concurrent acquirer already claimed the stale lock.
498
+ // They'll race us on _writeLockAtomic below; the EEXIST branch
499
+ // handles the loser case correctly.
500
+ } else {
501
+ console.warn('[Singleton] Cannot claim stale lock at ' + lockFile + ': ' + e.message);
502
+ return false;
503
+ }
504
+ }
505
+ try {
506
+ _writeLockAtomic(lockFile, _lockPayload());
507
+ } catch (linkErr) {
508
+ try { fs.unlinkSync(claimFile); } catch (_) {}
509
+ if (linkErr && linkErr.code === 'EEXIST') {
510
+ // Lost the link race to another concurrent acquirer. Read who
511
+ // won (best-effort) for the log line.
512
+ const winner = _readLockPayload(lockFile);
513
+ console.log('[Singleton] Lost takeover race to PID ' + (winner && winner.pid) + '. Exiting.');
514
+ return false;
215
515
  }
516
+ throw linkErr;
216
517
  }
217
- fs.writeFileSync(lockFile, String(process.pid));
518
+ try { fs.unlinkSync(claimFile); } catch (_) {}
218
519
  return true;
219
520
  } catch (err) {
220
521
  console.error('[Singleton] Lock acquisition failed:', err);
@@ -226,12 +527,73 @@ function releaseLock() {
226
527
  const lockFile = getLockFilePath();
227
528
  try {
228
529
  if (fs.existsSync(lockFile)) {
229
- const pid = parseInt(fs.readFileSync(lockFile, 'utf8').trim(), 10);
230
- if (pid === process.pid) fs.unlinkSync(lockFile);
530
+ const payload = _readLockPayload(lockFile);
531
+ if (payload && payload.pid === process.pid) fs.unlinkSync(lockFile);
231
532
  }
232
533
  } catch (e) { /* ignore */ }
233
534
  }
234
535
 
536
+ // Round-7 (§20.7): the daemon-lock acquireLock() only fires for `--loop`
537
+ // mode; CLI subcommands like `evolver fetch` and `evolver sync` run
538
+ // without acquiring the lock and freely call sendHelloToHub when
539
+ // node_secret is missing. The hub-side hello-with-rotate rewrites the
540
+ // node_secret on disk, so two writers (the daemon's heartbeat path
541
+ // rotating one secret + this CLI's sendHelloToHub writing a different
542
+ // one) race to be "last writer." Whichever wrote second silences the
543
+ // other -- the daemon then 401-loops -> enters reauth backoff -> goes
544
+ // silent for 30 min..4 h. The original §6 "instance lock" scenario.
545
+ //
546
+ // This helper does NOT take the lock (the daemon legitimately owns it);
547
+ // it only refuses to proceed if a LIVE daemon owns the lock AND we are
548
+ // about to send a fresh hello. If the daemon is alive it already has a
549
+ // valid secret in ~/.evomap/node_secret, so the right thing for the CLI
550
+ // is to wait briefly for the secret to appear (newly registered daemon)
551
+ // or exit with an actionable error.
552
+ //
553
+ // Callers: every CLI subcommand whose runner could call sendHelloToHub()
554
+ // when getHubNodeSecret() returns empty. Currently: fetch, sync
555
+ // (round-7 §20.7), plus atp-complete, buy, orders, verify (round-8
556
+ // §21.8 -- the ATP runners hit the same vector via consumerAgent /
557
+ // merchantAgent / atpExecute paths).
558
+ function refuseHelloIfDaemonRunning(toolLabel) {
559
+ try {
560
+ const lockFile = getLockFilePath();
561
+ if (!fs.existsSync(lockFile)) return; // no daemon
562
+ const payload = _readLockPayload(lockFile);
563
+ if (!payload || !Number.isFinite(payload.pid) || payload.pid <= 0) return;
564
+ if (payload.pid === process.pid) return; // shouldn't happen for CLI
565
+ // Round-9: a lease-aware lock whose mtime has gone stale means the
566
+ // daemon is dead (or its PID was reused). Do NOT refuse on it -- that
567
+ // was the "CLI hard-exits because it trusts a recyclable PID" hole.
568
+ if (_lockIsStaleByLease(lockFile, payload)) return;
569
+ try {
570
+ process.kill(payload.pid, 0);
571
+ } catch (e) {
572
+ if (e && e.code === 'ESRCH') return; // stale lock, daemon is gone
573
+ // EPERM = alive under a different user; still a real daemon. Fall
574
+ // through to refuse.
575
+ }
576
+ console.error(
577
+ '[' + toolLabel + '] Refusing to send hello: an evolver daemon ' +
578
+ '(PID ' + payload.pid + ') is running and owns ~/.evomap/instance.lock.'
579
+ );
580
+ console.error(
581
+ ' Two concurrent hello calls would rotate node_secret against ' +
582
+ 'each other and silence the daemon for hours.'
583
+ );
584
+ console.error(
585
+ ' Either wait for the daemon to register (the secret will ' +
586
+ 'appear at ~/.evomap/node_secret), or stop the daemon and retry.'
587
+ );
588
+ process.exit(1);
589
+ } catch (_) {
590
+ // Never let the lock-check helper itself escape; if the helper
591
+ // throws (FS permission, etc.) we fall through to the original code
592
+ // path. The race we're protecting against is rare; failing closed
593
+ // here would block legitimate CLI use.
594
+ }
595
+ }
596
+
235
597
  async function main() {
236
598
  const args = process.argv.slice(2);
237
599
  const command = args[0];
@@ -242,13 +604,49 @@ async function main() {
242
604
 
243
605
  if (!command || command === 'run' || command === '/evolve' || isLoop) {
244
606
  if (isLoop) {
607
+ // EPIPE protection. The daemon may outlive the controlling
608
+ // terminal (user closes the iTerm tab, ssh session drops, parent
609
+ // shell exits). The SIGHUP handler below covers the signal side,
610
+ // but the underlying pty fd is gone and the FIRST subsequent
611
+ // console.log writes to a closed pipe -> stdout emits 'error'
612
+ // with EPIPE. Without a listener attached, Node escalates EPIPE
613
+ // to uncaughtException, which our handler then turns into
614
+ // process.exit(1). Net result: daemon silently dies the next
615
+ // time it tries to log, with no useful trace. Swallow EPIPE
616
+ // explicitly so the daemon stays alive when its terminal goes
617
+ // away (matching standard daemonization practice).
618
+ try {
619
+ // EPIPE: swallow (daemon must outlive its controlling terminal).
620
+ // Non-EPIPE (EIO, ENOSPC on redirected log, etc.): the listener
621
+ // already prevents 'error' from escalating to uncaughtException,
622
+ // so write a one-line trace to the *other* stream so operators
623
+ // can see the failure mode instead of finding a silent daemon.
624
+ process.stdout.on('error', function (err) {
625
+ if (err && err.code === 'EPIPE') return;
626
+ try { process.stderr.write('[evolver] stdout error: ' + (err && (err.code || err.message) || err) + '\n'); } catch (_) {}
627
+ });
628
+ process.stderr.on('error', function (err) {
629
+ if (err && err.code === 'EPIPE') return;
630
+ try { process.stdout.write('[evolver] stderr error: ' + (err && (err.code || err.message) || err) + '\n'); } catch (_) {}
631
+ });
632
+ } catch (_) {}
633
+
245
634
  const originalLog = console.log;
246
635
  const originalWarn = console.warn;
247
636
  const originalError = console.error;
248
637
  function ts() { return '[' + new Date().toISOString() + ']'; }
249
- console.log = (...args) => { originalLog.call(console, ts(), ...args); };
250
- console.warn = (...args) => { originalWarn.call(console, ts(), ...args); };
251
- console.error = (...args) => { originalError.call(console, ts(), ...args); };
638
+ // Wrap originals in try/catch so a broken transport (closed pty,
639
+ // disk full on a redirected log file) cannot escape and trip
640
+ // uncaughtException -> exit(1) the next time we log.
641
+ console.log = (...args) => {
642
+ try { originalLog.call(console, ts(), ...args); } catch (_) {}
643
+ };
644
+ console.warn = (...args) => {
645
+ try { originalWarn.call(console, ts(), ...args); } catch (_) {}
646
+ };
647
+ console.error = (...args) => {
648
+ try { originalError.call(console, ts(), ...args); } catch (_) {}
649
+ };
252
650
  }
253
651
 
254
652
  console.log('Starting evolver...');
@@ -274,26 +672,371 @@ async function main() {
274
672
  if (isLoop) {
275
673
  // Internal daemon loop (no wrapper required).
276
674
  if (!acquireLock()) process.exit(0);
675
+ // Round-9: refresh the lock lease so other processes can detect a
676
+ // crash / PID reuse via stale mtime instead of trusting kill(0).
677
+ startLockRefresh();
678
+
679
+ // Linux OOM score adjustment: lower oom_score_adj so the kernel
680
+ // deprioritises evolver when choosing an OOM victim. This is a
681
+ // best-effort hint -- the kernel can still kill us under extreme
682
+ // memory pressure, but we will not be the first target.
683
+ //
684
+ // Value -500 (range -1000..1000; -1000 = never kill, 0 = default,
685
+ // +1000 = kill first). -500 gives meaningful protection without
686
+ // reserving the slot for truly critical system services.
687
+ //
688
+ // Requires the process to be either root or to have CAP_SYS_RESOURCE.
689
+ // On most Docker/k8s images running as non-root this write will fail
690
+ // with EACCES -- that is expected and harmless; we log a one-liner so
691
+ // operators know to pass --oom-score-adj=-500 via their container spec,
692
+ // or to set /proc/<pid>/oom_score_adj from the supervising process.
693
+ //
694
+ // Users who want to set this from outside the process (safer, no CAP):
695
+ // echo -500 > /proc/$(pgrep -f "node.*evolver.*--loop")/oom_score_adj
696
+ //
697
+ // Opt-out: EVOLVER_DISABLE_OOM_ADJUST=1
698
// Best-effort: on Linux, write -500 to /proc/self/oom_score_adj unless the
// operator opted out with EVOLVER_DISABLE_OOM_ADJUST=1. A failed write is
// reported with its error code and otherwise ignored.
if (process.platform === 'linux' &&
    String(process.env.EVOLVER_DISABLE_OOM_ADJUST || '') !== '1') {
  const oomScoreAdjFile = '/proc/self/oom_score_adj';
  const oomScoreAdj = '-500';
  try {
    require('fs').writeFileSync(oomScoreAdjFile, oomScoreAdj + '\n', 'utf8');
    console.log(
      '[evolver] Set Linux oom_score_adj=' + oomScoreAdj +
      ' to reduce OOM-kill priority.'
    );
  } catch (oomErr) {
    // The write is non-fatal whatever the cause; surface the code so the
    // operator can tell a permissions problem from anything else.
    const oomCode = (oomErr && oomErr.code) || 'unknown';
    console.log(
      '[evolver] Could not set oom_score_adj (' + oomCode +
      '). To protect evolver from OOM kill, run as root, add ' +
      'CAP_SYS_RESOURCE, or set oom_score_adj externally via your ' +
      'container spec (e.g. resources.requests + oom_score_adj in k8s).'
    );
  }
}
716
+
717
+ // Round-4: macOS App Nap / QoS demotion mitigation. Without this,
718
+ // a backgrounded `evolver --loop` running in an iTerm tab gets its
719
+ // process QoS demoted to UTILITY/BACKGROUND once the parent app
720
+ // is no longer focused. CPU runtime caps to ~5% of one core,
721
+ // setTimeout resolution drops toward 1 Hz, disk I/O is throttled.
722
+ // The drift detector cannot rescue this because the demotion does
723
+ // NOT cause Date.now() to jump -- only the inter-tick interval
724
+ // dilates, which the detector samples through its own (also
725
+ // demoted) setInterval. Net result: heartbeat appears alive but
726
+ // ticks fire so slowly that the hub marks the node offline,
727
+ // matching the user-reported "first launch ok -> idle -> dead
728
+ // forever" pattern.
729
+ //
730
+ // os.setPriority() raises BSD process priority; macOS bridges that
731
+ // to Mach thread QoS via the priority bridge so the demotion does
732
+ // not engage. -10 is the most negative value raisable without
733
+ // root. Failures are logged but non-fatal (e.g. EPERM under a
734
+ // restrictive sandbox -- the daemon continues, just unprotected).
735
+ // Opt-out via EVOLVER_DISABLE_PRIORITY_BOOST=1 for users on
736
+ // power-constrained battery profiles who would rather accept
737
+ // the throttle than the extra wake-time.
738
// macOS App Nap / QoS-demotion mitigation for the daemon loop.
//
// Layer 1: ask for process priority -10 and read the value back, because a
// setPriority() call that returns without throwing is not treated as proof
// that the boost landed.
// NOTE(review): setpriority(2) commonly rejects negative nice values for
// unprivileged processes, so the catch branch may be the usual path for
// non-root users -- confirm on target hosts.
//
// Layer 2: spawn `caffeinate -i -w <pid>` when the user opted in
// (EVOLVER_CAFFEINATE=1/true), or when layer 1 did not land and the user did
// not opt out (EVOLVER_CAFFEINATE=0/false).
//
// The whole block is skipped when EVOLVER_DISABLE_PRIORITY_BOOST=1.
if (process.platform === 'darwin' &&
    String(process.env.EVOLVER_DISABLE_PRIORITY_BOOST || '') !== '1') {
  let priorityBoostOk = false;
  try {
    const os = require('os');
    os.setPriority(0, -10);
    // Verify instead of trusting the call: only an observed value of -10 or
    // lower counts as a successful boost.
    const observed = os.getPriority();
    if (observed <= -10) {
      priorityBoostOk = true;
      console.log('[evolver] Raised process priority on macOS to ' + observed +
        ' to prevent App Nap / QoS demotion.');
    } else {
      console.warn('[evolver] setPriority(-10) reported success but observed priority is ' +
        observed + '; App Nap protection NOT in effect. ' +
        'Run with EVOLVER_CAFFEINATE=1 or via `caffeinate -is node index.js --loop`.');
    }
  } catch (e) {
    console.warn('[evolver] setPriority(-10) refused (' + (e && e.code || 'unknown') +
      '): ' + (e && e.message || e) + '. App Nap protection NOT in effect. ' +
      'Run with EVOLVER_CAFFEINATE=1 or via `caffeinate -is node index.js --loop`.');
  }

  const caffeinateRaw = String(process.env.EVOLVER_CAFFEINATE || '').toLowerCase().trim();
  const caffeinateOptedIn = caffeinateRaw === '1' || caffeinateRaw === 'true';
  const caffeinateOptedOut = caffeinateRaw === '0' || caffeinateRaw === 'false';
  // Default-on fallback: used whenever the priority boost did not land and
  // the user has not explicitly opted out.
  const caffeinateFallback = !priorityBoostOk && !caffeinateOptedOut;
  if (caffeinateOptedIn || caffeinateFallback) {
    // One reporter for both failure channels: a synchronous throw from
    // spawn() and the asynchronous 'error' event on the child.
    const reportCaffeinateFailure = (e) => {
      console.warn('[evolver] caffeinate spawn failed: ' +
        (e && e.message || e) + '. App Nap may throttle the heartbeat. ' +
        'Install caffeinate (Xcode CLT) or run under a launchd plist with NSAppSleepDisabled=1.');
    };
    try {
      const child = spawn('caffeinate', ['-i', '-w', String(process.pid)], {
        detached: true,
        stdio: 'ignore',
      });
      // spawn() reports a missing or non-executable binary through an
      // 'error' event rather than a throw. Without a listener that event is
      // rethrown as an uncaught exception, which this daemon's
      // uncaughtException handler converts into process.exit(1).
      child.on('error', reportCaffeinateFailure);
      child.unref();
      // pid is undefined when the process could not be started; in that case
      // the 'error' listener above does the reporting.
      if (child.pid !== undefined) {
        console.log('[evolver] Spawned caffeinate -i -w ' + process.pid +
          ' to block App Nap (pid ' + child.pid + ').' +
          (caffeinateFallback ? ' (fallback because priority boost was refused)' : ''));
      }
    } catch (e) {
      reportCaffeinateFailure(e);
    }
  }
}
797
+
798
+ // Event-loop keep-alive anchor (defense-in-depth for the sleepMs fix).
799
+ //
800
+ // All timers in a2aProtocol.js (heartbeat, drift detector, self-driving
801
+ // poll, SSE reconnect) are unref'd so they never prevent a clean exit.
802
+ // The sleepMs() timer above is now ref'd (the primary fix), but as an
803
+ // additional safety net we install one ref'd setInterval here that fires
804
+ // every 10 minutes. Its only job is to emit a lightweight log line so
805
+ // the evolver_loop.log gets touched even when the daemon is completely
806
+ // idle (no session signals, evolve cycle sleeping at maxSleepMs). This
807
+ // guarantees the event loop has at least one ref'd handle at all times
808
+ // while the daemon is running, and provides a heartbeat-on-disk so
809
+ // lifecycle.checkHealth() (MAX_SILENCE_MS = 30 min default) does not
810
+ // wrongly declare the process stagnant during legitimate long idle windows.
811
+ // Cleared in shutdown() so it does not outlive the daemon.
812
// Ref'd 10-minute interval that anchors the event loop and appends one
// `keepalive_tick` JSON line to the evolver log on each firing. Every failure
// inside the callback is swallowed so the timer itself can never be torn
// down by a logging problem.
const _KEEPALIVE_INTERVAL_MS = 10 * 60 * 1000;
let _keepAliveTimer = setInterval(function () {
  try {
    const a2aKA = require('./src/gep/a2aProtocol');
    // No stats accessor -> nothing to report on this tick.
    if (typeof a2aKA.getHeartbeatStats !== 'function') return;

    const s = a2aKA.getHeartbeatStats();
    const { getEvolverLogPath } = require('./src/gep/paths');
    const fsKA = require('fs');
    const pathKA = require('path');

    const logPath = getEvolverLogPath();
    const ensureLogDir = function () {
      fsKA.mkdirSync(pathKA.dirname(logPath), { recursive: true });
    };
    ensureLogDir();

    const line = JSON.stringify({
      ts: new Date().toISOString(),
      type: 'keepalive_tick',
      hb_running: s.running,
      hb_last_tick_ago_s: s.lastTickAt ? Math.round((Date.now() - s.lastTickAt) / 1000) : null,
    }) + '\n';
    const appendLine = function () {
      fsKA.appendFileSync(logPath, line, { encoding: 'utf8' });
    };

    try {
      appendLine();
    } catch (e) {
      // Only a vanished directory is retried (once); any other append error
      // is dropped.
      if (!e || e.code !== 'ENOENT') return;
      ensureLogDir();
      appendLine();
    }
  } catch (_) { /* never let any error kill the keep-alive timer */ }
}, _KEEPALIVE_INTERVAL_MS);
846
+ // Intentionally ref'd: this is the explicit event-loop anchor.
847
+ // Do NOT add .unref() here -- that would defeat the purpose.
848
+
277
849
  function shutdown() {
850
+ if (_keepAliveTimer) { clearInterval(_keepAliveTimer); _keepAliveTimer = null; }
851
+ stopLockRefresh();
278
852
  releaseLock();
853
+ // stopHeartbeat() clears the drift detector interval and the heartbeat
854
+ // timer, preventing "ghost tick" log noise after exit and ensuring a
855
+ // clean state if the process is somehow continued (test harness, etc.).
856
+ try { require('./src/gep/a2aProtocol').stopHeartbeat(); } catch (e) {}
279
857
  try { require('./src/gep/a2aProtocol').stopEventStream(); } catch (e) {}
280
858
  }
281
859
  process.on('exit', shutdown);
282
860
  process.on('SIGINT', () => { shutdown(); process.exit(); });
283
861
  process.on('SIGTERM', () => { shutdown(); process.exit(); });
862
+ // SIGHUP: two meanings depending on platform and how the daemon was started.
863
+ //
864
+ // macOS / interactive terminal: closing the iTerm/Terminal tab sends
865
+ // SIGHUP to the controlling process, and Node's default action is to
866
+ // terminate. That is the most common "first-launch, then idle, then
867
+ // evolver dead" path on macOS. As a daemon we intentionally ignore it.
868
+ //
869
+ // Linux systemd: `systemctl reload evolver` delivers SIGHUP to signal
870
+ // configuration reload. The socket / connection state may be stale (e.g.
871
+ // the hub URL changed in .env, or the admin wants a fresh hello after a
872
+ // manual secret rotation). We treat reload as a soft wake-recovery: drain
873
+ // the undici pool, poke the heartbeat, and restart the SSE stream, which
874
+ // is identical to what SIGCONT / the drift detector do on system resume.
875
+ // We also emit sd_notify RELOADING=1 / READY=1 so systemd can track the
876
+ // reload state (required for Type=notify units that call systemctl reload).
877
+ //
878
+ // A one-shot (non --loop) invocation keeps the default behavior because
879
+ // this branch is gated on `isLoop`.
880
// SIGHUP handler for daemon mode.
//   - Non-Linux: log that the signal is being ignored and carry on.
//   - Linux: treat it as a reload -- announce RELOADING to the service
//     manager, run wake recovery, interrupt pending sleeps, announce READY.
// Each step is isolated so that one failing step does not skip the others.
process.on('SIGHUP', () => {
  try {
    if (process.platform !== 'linux') {
      console.warn('[evolver] Received SIGHUP (controlling terminal closed?). ' +
        'Daemon ignoring -- heartbeat loop continues. To stop the daemon use SIGINT/SIGTERM.');
      return;
    }

    // Loads a2aProtocol on every call and hands it to `use`; both load and
    // use errors are swallowed.
    const withA2a = (use) => {
      try {
        use(require('./src/gep/a2aProtocol.js'));
      } catch (_) {}
    };

    withA2a((a2a) => {
      if (typeof a2a._sdNotify !== 'function') return;
      // process.hrtime() yields [seconds, nanoseconds]; convert to whole
      // microseconds using plain Number arithmetic (no BigInt).
      const [seconds, nanos] = process.hrtime();
      const monotonicUsec = seconds * 1000000 + Math.floor(nanos / 1000);
      a2a._sdNotify('RELOADING=1\nMONOTONIC_USEC=' + monotonicUsec);
    });

    console.warn('[evolver] Received SIGHUP on Linux (systemctl reload?). ' +
      'Running wake recovery (drain pool + poke heartbeat + restart SSE). ' +
      'To stop the daemon use SIGINT/SIGTERM.');

    withA2a((a2a) => {
      if (typeof a2a._runWakeRecovery === 'function') a2a._runWakeRecovery();
    });

    // Wake the evolve loop now instead of letting it finish its sleep.
    try { _interruptAllSleeps(); } catch (_) {}

    // Close the RELOADING window opened above.
    withA2a((a2a) => {
      if (typeof a2a._sdNotify === 'function') a2a._sdNotify('READY=1');
    });
  } catch (_) {}
});
923
+ // SIGCONT fires on `kill -CONT`, debugger detach, and some VM/sleep
924
+ // resume paths. Nudge the heartbeat loop so it doesn't sit waiting for
925
+ // its next scheduled tick (which could be up to 30 min away under
926
+ // backoff) before reconnecting after a wake event. Also restart the
927
+ // SSE stream: the underlying TCP socket almost certainly died during
928
+ // the SIGSTOP window without a FIN reaching us, and the existing
929
+ // exponential reconnect could be up to 120s away on the resumed
930
+ // monotonic clock.
931
+ // Round-6 (§19.5): register process-level wake hooks so both the
932
+ // SIGCONT handler and the drift detector's long-sleep branch
933
+ // (a2aProtocol) interrupt the outer evolve sleepMs and poke the
934
+ // validator daemon, not just the heartbeat-internal recovery.
935
+ _registerProcessWakeHooks();
936
+ // SIGCONT is not supported on Windows (process.on() throws ERR_UNKNOWN_SIGNAL).
937
+ // Wake recovery on Windows is handled exclusively by the drift detector.
938
// SIGCONT -> shared wake recovery. The listener is not registered on win32.
// Recovery itself lives in a2aProtocol._runWakeRecovery; this handler only
// forwards to it and swallows any load or call failure.
if (process.platform !== 'win32') {
  process.on('SIGCONT', function onSigcont() {
    try {
      const a2a = require('./src/gep/a2aProtocol.js');
      const canRecover = typeof a2a._runWakeRecovery === 'function';
      if (canRecover) {
        a2a._runWakeRecovery();
      }
    } catch (_) {}
  });
}
284
959
  process.on('uncaughtException', (err) => {
285
960
  console.error('[FATAL] Uncaught exception:', err && err.stack ? err.stack : String(err));
286
961
  releaseLock();
287
962
  process.exit(1);
288
963
  });
289
964
  // Sliding window: only exit if many rejections cluster in a short
290
- // period. A daemon running for weeks can accumulate harmless,
291
- // unrelated rejections (transient network blips, hub timeouts);
292
- // the original cumulative counter would eventually kill the
293
- // process for noise. Cluster = real failure cascade.
965
+ // period AND the daemon shows no other signs of life. A daemon
966
+ // running for weeks can accumulate harmless, unrelated rejections
967
+ // (transient network blips, hub timeouts); the original cumulative
968
+ // counter would eventually kill the process for noise. Cluster =
969
+ // real failure cascade. But macOS wake bursts also synthesize
970
+ // clusters: heartbeat / SSE / validator / merchantAgent / ATP all
971
+ // fire near-simultaneously on resume and any subsystem with an
972
+ // unhandled async-callback throw can blow past 5 rejections in
973
+ // seconds. We add a liveness gate so an actively-recovering
974
+ // daemon doesn't kill itself in the middle of a wake-recovery
975
+ // storm. Threshold and window widened to match the macOS-wake
976
+ // amplification observed in round-2 testing.
294
977
  const REJECTION_WINDOW_MS = 5 * 60 * 1000;
295
- const REJECTION_THRESHOLD = 5;
978
+ const REJECTION_THRESHOLD = 10;
979
+ const RECENT_LIVENESS_MS = 60 * 1000;
296
980
  let _rejectionTimestamps = [];
981
+ function _heartbeatLooksAlive() {
982
+ // Round-6 (§19.8): the previous implementation reached into
983
+ // the `_testing` namespace and returned false (= "treat as
984
+ // dead, exit on cluster") if that test-only accessor was
985
+ // unavailable. Under bundling / minification / a future
986
+ // refactor that drops the `_testing` export, this turned a
987
+ // recovery storm into a guaranteed exit -- the OPPOSITE of
988
+ // what the gate exists to do. Switched to the public
989
+ // getHeartbeatStats() API (which surfaces `running` and
990
+ // `lastTickAt` for exactly this purpose) and made the
991
+ // require failure path "fail open" -- assume alive so we
992
+ // don't kill an actively-recovering daemon just because the
993
+ // module load failed on this turn.
994
+ //
995
+ // Round-10: `running` + recent `lastTickAt` alone are not
996
+ // enough to claim "alive." `lastTickAt` is stamped at the
997
+ // TOP of every heartbeat tick, regardless of whether the
998
+ // tick actually makes progress -- including ticks that
999
+ // immediately bail out because the loop is spinning in a
1000
+ // reauth backoff window (see a2aProtocol.js getHeartbeatStats
1001
+ // comment near :2940, which acknowledges that the loop
1002
+ // showed `running: true, lastTickAt: <recent>` even when
1003
+ // silent for 30 min waiting on a reauth backoff). In that
1004
+ // state a rejection cascade originating OUTSIDE the
1005
+ // heartbeat would be repeatedly forgiven while the loop is
1006
+ // not actually making forward progress. Require additionally
1007
+ // that `consecutiveFailures === 0` and that we are not
1008
+ // currently inside a reauth backoff window, so "alive" means
1009
+ // "making progress," not just "ticking."
1010
+ //
1011
+ // Trade-off: a transient hub blip that bumps
1012
+ // `consecutiveFailures` to 1 will now NOT forgive a
1013
+ // concurrent rejection cascade. That is intentional --
1014
+ // cascade-forgiveness exists to avoid flapping during a
1015
+ // healthy loop; during an unhealthy loop we should not keep
1016
+ // absorbing rejections silently.
1017
+ try {
1018
+ const a2a = require('./src/gep/a2aProtocol.js');
1019
+ if (!a2a || typeof a2a.getHeartbeatStats !== 'function') {
1020
+ // Cannot read state -- fail open. A real wedged daemon
1021
+ // will be caught by the next rejection if/when stats
1022
+ // become available, or by other watchdogs.
1023
+ return true;
1024
+ }
1025
+ const s = a2a.getHeartbeatStats();
1026
+ if (!s || !s.running) return false;
1027
+ const last = s.lastTickAt || 0;
1028
+ if (!(last > 0 && (Date.now() - last) < RECENT_LIVENESS_MS)) return false;
1029
+ // Round-10: gate on success state, not just tick freshness.
1030
+ if ((s.consecutiveFailures || 0) > 0) return false;
1031
+ if ((s.reauthBackoffUntil || 0) > Date.now()) return false;
1032
+ return true;
1033
+ } catch (_) {
1034
+ // Module load threw -- fail open for the same reason as
1035
+ // above. A genuinely broken require would surface via
1036
+ // uncaughtException long before this gate matters.
1037
+ return true;
1038
+ }
1039
+ }
297
1040
  process.on('unhandledRejection', (reason) => {
298
1041
  const now = Date.now();
299
1042
  _rejectionTimestamps.push(now);
@@ -302,7 +1045,15 @@ async function main() {
302
1045
  });
303
1046
  console.error('[FATAL] Unhandled promise rejection (' + _rejectionTimestamps.length + ' in window):', reason && reason.stack ? reason.stack : String(reason));
304
1047
  if (_rejectionTimestamps.length >= REJECTION_THRESHOLD) {
305
- console.error('[FATAL] ' + _rejectionTimestamps.length + ' unhandled rejections within ' + (REJECTION_WINDOW_MS / 1000) + 's. Exiting to avoid corrupt state.');
1048
+ if (_heartbeatLooksAlive()) {
1049
+ console.warn('[FATAL] ' + _rejectionTimestamps.length + ' rejections within ' +
1050
+ (REJECTION_WINDOW_MS / 1000) + 's BUT heartbeat ticked in the last ' +
1051
+ (RECENT_LIVENESS_MS / 1000) + 's. Treating as recovery storm, not exiting. ' +
1052
+ 'Resetting rejection window so a real subsequent cascade can still trip the trap.');
1053
+ _rejectionTimestamps = [];
1054
+ return;
1055
+ }
1056
+ console.error('[FATAL] ' + _rejectionTimestamps.length + ' unhandled rejections within ' + (REJECTION_WINDOW_MS / 1000) + 's and no recent heartbeat activity. Exiting to avoid corrupt state.');
306
1057
  releaseLock();
307
1058
  process.exit(1);
308
1059
  }
@@ -508,10 +1259,29 @@ async function main() {
508
1259
  if (consent.enabled) {
509
1260
  const hubUrl = process.env.A2A_HUB_URL || process.env.EVOMAP_HUB_URL || '';
510
1261
  if (hubUrl) {
511
- autoBuyer.start({
512
- dailyCap: Number(process.env.ATP_AUTOBUY_DAILY_CAP_CREDITS) || undefined,
513
- perOrderCap: Number(process.env.ATP_AUTOBUY_PER_ORDER_CAP_CREDITS) || undefined,
514
- });
1262
+ // Round-5: previously this bare start() call was a true
1263
+ // fire-and-forget. If autoBuyer.start returned a rejected
1264
+ // promise (transient hub error, bad config, mid-wake DNS
1265
+ // flap), the unhandledRejection escaped to the
1266
+ // process-level handler -- which, post round-3, only
1267
+ // exits if heartbeat is also dead. Net effect: daemon
1268
+ // stays alive but the autobuyer is half-initialized and
1269
+ // silently ignores claims. Attach a catch so the
1270
+ // operator can see the failure and the daemon-survival
1271
+ // gate is not relied on.
1272
// Start the auto-buyer without letting either failure mode escape: a
// synchronous throw is caught here, and a rejected promise gets its own
// catch handler so it never reaches the process-level rejection trap.
try {
  const autoBuyerOptions = {
    dailyCap: Number(process.env.ATP_AUTOBUY_DAILY_CAP_CREDITS) || undefined,
    perOrderCap: Number(process.env.ATP_AUTOBUY_PER_ORDER_CAP_CREDITS) || undefined,
  };
  const _autoBuyerPromise = autoBuyer.start(autoBuyerOptions);
  const canAttachCatch = _autoBuyerPromise && typeof _autoBuyerPromise.catch === 'function';
  if (canAttachCatch) {
    _autoBuyerPromise.catch((abErr) => {
      console.warn('[ATP-AutoBuyer] start() rejected: ' + (abErr && abErr.message || abErr));
    });
  }
} catch (abSyncErr) {
  console.warn('[ATP-AutoBuyer] start() threw synchronously: ' + (abSyncErr && abSyncErr.message || abSyncErr));
}
515
1285
  if (consent.source === 'default') {
516
1286
  // First-run on a non-TTY (daemon, hook, CI) where the prompt
517
1287
  // could not fire AND no env override + no ack file. autoBuyer
@@ -538,9 +1308,19 @@ async function main() {
538
1308
  const hubUrl = process.env.A2A_HUB_URL || process.env.EVOMAP_HUB_URL || '';
539
1309
  if (hubUrl) {
540
1310
  const autoDeliver = require('./src/atp/autoDeliver');
541
- autoDeliver.start({
542
- pollMs: Number(process.env.ATP_AUTODELIVER_POLL_MS) || undefined,
543
- });
1311
+ // Round-5: same fire-and-forget hardening as autoBuyer above.
1312
// Start auto-deliver defensively: a synchronous throw is logged here, and a
// rejected promise is logged by its own catch handler instead of becoming an
// unhandled rejection.
try {
  const autoDeliverOptions = {
    pollMs: Number(process.env.ATP_AUTODELIVER_POLL_MS) || undefined,
  };
  const _autoDeliverPromise = autoDeliver.start(autoDeliverOptions);
  const canAttachCatch = _autoDeliverPromise && typeof _autoDeliverPromise.catch === 'function';
  if (canAttachCatch) {
    _autoDeliverPromise.catch((adErr) => {
      console.warn('[ATP-AutoDeliver] start() rejected: ' + (adErr && adErr.message || adErr));
    });
  }
} catch (adSyncErr) {
  console.warn('[ATP-AutoDeliver] start() threw synchronously: ' + (adSyncErr && adSyncErr.message || adSyncErr));
}
544
1324
  } else {
545
1325
  console.warn('[ATP-AutoDeliver] autodeliver enabled but no hub URL configured, skipping.');
546
1326
  }
@@ -552,6 +1332,7 @@ async function main() {
552
1332
  // Hoist module refs used inside the loop to avoid repeated module lookups per cycle
553
1333
  const idleScheduler = require('./src/gep/idleScheduler');
554
1334
  const { shouldDistillFromFailures: shouldDF, autoDistillFromFailures: autoDF } = require('./src/gep/skillDistiller');
1335
+ const { autoDistillLlm } = require('./src/gep/autoDistillLlm'); // P3: autonomous LLM distillation (shadow-first, off by default)
555
1336
  const { tryExplore } = require('./src/gep/explore');
556
1337
 
557
1338
  let currentSleepMs = minSleepMs;
@@ -674,6 +1455,21 @@ async function main() {
674
1455
  } catch (e) {
675
1456
  if (isVerbose) console.warn('[OMLS] Distill error: ' + (e.message || e));
676
1457
  }
1458
+ // P3: autonomous LLM-quality distillation of SUCCESS capsules.
1459
+ // Default off; shadow logs a candidate; enforce upserts (after a
1460
+ // real run-green gate). Reuses the P1 exec bridge under the hood.
1461
// Optional LLM distillation pass. Skipped entirely while
// EVOLVER_AUTO_DISTILL_LLM is unset or 'off'; any failure is non-fatal and
// only reported in verbose mode.
if ((process.env.EVOLVER_AUTO_DISTILL_LLM || 'off') !== 'off') {
  try {
    const llmRes = await autoDistillLlm();
    const enforcedGene = llmRes && llmRes.ok && llmRes.gene;
    if (enforcedGene) {
      console.log('[OMLS] Idle-window LLM distillation enforced gene: ' + llmRes.gene.id);
    } else if (llmRes && llmRes.reason === 'shadow_logged') {
      const shadowId = llmRes.candidate && llmRes.candidate.id;
      console.log('[OMLS] LLM distillation shadow candidate: ' + shadowId);
    }
  } catch (e) {
    if (isVerbose) {
      console.warn('[OMLS] LLM distill error (non-fatal): ' + (e.message || e));
    }
  }
}
677
1473
  }
678
1474
  if (schedule.should_explore) {
679
1475
  try {
@@ -685,6 +1481,22 @@ async function main() {
685
1481
  if (isVerbose) console.warn('[OMLS] Explore error: ' + (e.message || e));
686
1482
  }
687
1483
  }
1484
+ // P2: conversation capability -> distilled gene (shadow-only v1).
1485
+ // Deliberately OUTSIDE the should_distill guard: should_distill is
1486
+ // true only at aggressive/deep intensity, but headless/air-gapped
1487
+ // hosts fall back to 'normal', which would make P2 a dead feature.
1488
+ // A freshly-discovered capability is time-relevant; gate it solely on
1489
+ // its own flag + the per-slug cooldown + a non-empty queue (all of
1490
+ // which already bound spend). Default off => zero behavior change.
1491
// Optional conversation-distillation pass. Gated only on
// EVOLVER_CONV_DISTILL_ENABLED (unset or 'off' disables it). The module is
// loaded lazily, and failures are non-fatal and reported in verbose mode only.
if ((process.env.EVOLVER_CONV_DISTILL_ENABLED || 'off') !== 'off') {
  try {
    const { autoDistillConversation } = require('./src/gep/autoDistillConv');
    const convRes = await autoDistillConversation();
    if (convRes && convRes.ok) {
      const subject = convRes.gene_id || convRes.reason;
      console.log('[P2] conv-distill ' + convRes.mode + ' candidate: ' + subject);
    }
  } catch (e) {
    if (isVerbose) {
      console.warn('[P2] conv-distill error (non-fatal): ' + (e.message || e));
    }
  }
}
688
1500
  if (isVerbose && schedule.idle_seconds >= 0) {
689
1501
  console.log(`[OMLS] idle=${schedule.idle_seconds}s intensity=${schedule.intensity} multiplier=${omlsMultiplier}`);
690
1502
  }
@@ -911,6 +1723,60 @@ async function main() {
911
1723
  console.error('[SOLIDIFY] Error:', error);
912
1724
  process.exit(2);
913
1725
  }
1726
+ } else if (command === 'exec') {
1727
+ // node index.js exec --harness=claude-code [--once] [--max-cycles N]
1728
+ // P1 auto-exec bridge: run the Brain, scrape its sessions_spawn(...), spawn
1729
+ // the Hand (headless claude) to apply + solidify. Shadow-first opt-in.
1730
+ if (String(process.env.EVOLVE_EXEC_BRIDGE || '').toLowerCase() !== 'true') {
1731
+ console.error('[exec] EVOLVE_EXEC_BRIDGE is not "true". The auto-exec bridge is opt-in. Refusing.');
1732
+ process.exit(2);
1733
+ }
1734
// Looks up the first `--<n>` / `--<n>=value` token in `args`.
//   - absent                         -> undefined
//   - `--n=value`                    -> everything after the first '='
//   - `--n value`                    -> the following token
//   - `--n` last, or before `--...`  -> true (boolean flag)
const getFlag = (n) => {
  const bare = `--${n}`;
  const assigned = `${bare}=`;
  const i = args.findIndex((a) => a === bare || a.startsWith(assigned));
  if (i === -1) return undefined;

  const token = args[i];
  const eq = token.indexOf('=');
  if (eq !== -1) return token.slice(eq + 1);

  // Bare flag: consume the next token only when it is not another --flag.
  const next = args[i + 1];
  if (next === undefined || next.startsWith('--')) return true;
  return next;
};
1745
+ const harness = String(getFlag('harness') || 'claude-code');
1746
+ const once = getFlag('once') === true;
1747
+ // #179 r7: validate --max-cycles. Number('foo')||0 silently became 0 =
1748
+ // unbounded daemon — a typo must fail fast, not run forever. Absent flag =>
1749
+ // 0 (intentional unbounded). A present value must be a non-negative integer.
1750
// Resolve --max-cycles into `maxCycles`:
//   - flag absent             -> 0 (unbounded)
//   - flag present, no value  -> usage error, exit 2
//   - anything else           -> must be a non-negative integer, else exit 2
const rawMaxCycles = getFlag('max-cycles');
let maxCycles = 0;
if (rawMaxCycles === true) {
  console.error('[exec] --max-cycles requires a value (e.g. --max-cycles 5 or --max-cycles=5)');
  process.exit(2);
} else if (rawMaxCycles !== undefined) {
  // Number('') and Number('   ') both evaluate to 0, so `--max-cycles=` (for
  // instance from an unset shell variable) used to be accepted as
  // "unbounded". Treat an empty or blank value as invalid instead.
  const isBlank = String(rawMaxCycles).trim() === '';
  const n = isBlank ? NaN : Number(rawMaxCycles);
  if (!Number.isInteger(n) || n < 0) {
    console.error(`[exec] invalid --max-cycles '${rawMaxCycles}' (expected a non-negative integer; 0 or omit = unbounded)`);
    process.exit(2);
  }
  maxCycles = n;
}
1763
+ if (!['claude-code', 'openclaw'].includes(harness)) {
1764
+ console.error(`[exec] unknown --harness '${harness}' (expected claude-code | openclaw)`);
1765
+ process.exit(2);
1766
+ }
1767
+ try {
1768
+ const { runExecBridge } = require('./src/gep/execBridge');
1769
+ const res = await runExecBridge({ harness, once, maxCycles });
1770
+ console.log(`[exec] done: cycles=${res.cycles} lastOutcome=${res.lastOutcome}`);
1771
+ // Exit 0 only on a genuine success. A bounded/daemon run that ended in
1772
+ // hand_failed/brain_failed/no_spawn must report non-zero to shells & CI
1773
+ // (Bugbot #179: do not exit 0 on failure just because cycles>0).
1774
+ process.exit(res.lastOutcome === 'success' ? 0 : 1);
1775
+ } catch (error) {
1776
+ console.error('[exec] bridge error:', error && error.message ? error.message : error);
1777
+ process.exit(1);
1778
+ }
1779
+
914
1780
  } else if (command === 'distill') {
915
1781
  const responseFileFlag = args.find(a => typeof a === 'string' && a.startsWith('--response-file='));
916
1782
  if (!responseFileFlag) {
@@ -1117,6 +1983,10 @@ async function main() {
1117
1983
 
1118
1984
  try {
1119
1985
  if (!getHubNodeSecret()) {
1986
+ // Round-7 (§20.7): if a daemon is up and we have no secret, we
1987
+ // would race the daemon's hello and silently corrupt its
1988
+ // node_secret. Refuse cleanly with a hint instead.
1989
+ refuseHelloIfDaemonRunning('fetch');
1120
1990
  console.log('[fetch] No node_secret found. Sending hello to Hub to register...');
1121
1991
  const helloResult = await sendHelloToHub();
1122
1992
  if (!helloResult || !helloResult.ok) {
@@ -1324,6 +2194,9 @@ async function main() {
1324
2194
 
1325
2195
  try {
1326
2196
  if (!getHubNodeSecret()) {
2197
+ // Round-7 (§20.7): refuse a fresh hello if a live daemon owns
2198
+ // the lock; the daemon's secret will appear shortly.
2199
+ refuseHelloIfDaemonRunning('sync');
1327
2200
  console.log('[sync] No node_secret found. Sending hello to Hub to register...');
1328
2201
  const helloResult = await sendHelloToHub();
1329
2202
  if (!helloResult || !helloResult.ok) {
@@ -1750,7 +2623,13 @@ async function main() {
1750
2623
  // we just print the unset hint)
1751
2624
  const path = require('path');
1752
2625
  const fs = require('fs');
1753
- const home = process.env.HOME || require('os').homedir();
2626
+ // Honor an explicit HOME override (used by tests to redirect to a fake
2627
+ // home) before falling back to os.homedir(). On POSIX, os.homedir() also
2628
+ // reads $HOME first, so this is a no-op in practice on macOS/Linux. On
2629
+ // Windows, os.homedir() reads %USERPROFILE% and ignores HOME -- without
2630
+ // this override, test/resetLocalSecret.test.js cannot inject a fake home
2631
+ // and the reset operates on the real user dir.
2632
+ const home = process.env.HOME || os.homedir();
1754
2633
  const stateFile = path.join(home, '.evomap', 'mailbox', 'state.json');
1755
2634
  const legacyFile = path.join(home, '.evomap', 'node_secret');
1756
2635
  let cleared = 0;
@@ -1796,6 +2675,18 @@ async function main() {
1796
2675
  // Invoked by a spawned Cursor sub-session after it has written the ATP
1797
2676
  // task answer to a file. Drives publish -> task/complete -> atp/deliver.
1798
2677
  try {
2678
+ // Round-8 (§21.8): if a daemon is up and the spawned subsession
2679
+ // somehow has no secret on disk, the inner completeAtpTask ->
2680
+ // _ensureNodeSecret -> sendHelloToHub call would race the
2681
+ // daemon's hello and silently corrupt the daemon's node_secret
2682
+ // (same vector round-7 §20.7 closed for fetch/sync). In the
2683
+ // common happy path the daemon already registered, the secret
2684
+ // exists, the guard is a no-op. Imported lazily so the helper
2685
+ // resolution does not slow down unrelated subcommands.
2686
+ try {
2687
+ const { getHubNodeSecret } = require('./src/gep/a2aProtocol');
2688
+ if (!getHubNodeSecret()) refuseHelloIfDaemonRunning('atp-complete');
2689
+ } catch (_) { /* never block ATP completion on a guard error */ }
1799
2690
  const subArgs = args.slice(1);
1800
2691
  function flag(name) {
1801
2692
  const pref = '--' + name + '=';
@@ -1833,6 +2724,16 @@ async function main() {
1833
2724
 
1834
2725
  } else if (command === 'buy' || command === 'orders' || command === 'verify' || command === 'atp') {
1835
2726
  try {
2727
+ // Round-8 (§21.8): same daemon-vs-CLI race protection as fetch/sync
2728
+ // and atp-complete. The ATP runners (consumerAgent / merchantAgent
2729
+ // / atpExecute) all call sendHelloToHub when getHubNodeSecret() is
2730
+ // empty, which clobbers a running daemon's secret and silences it
2731
+ // for 30 min..4 h. The check is a no-op when a secret already
2732
+ // exists (the common case once the daemon has registered).
2733
+ try {
2734
+ const { getHubNodeSecret } = require('./src/gep/a2aProtocol');
2735
+ if (!getHubNodeSecret()) refuseHelloIfDaemonRunning(command);
2736
+ } catch (_) { /* never block ATP CLI on a guard error */ }
1836
2737
  const atpCli = require('./src/atp/cli');
1837
2738
  const subArgs = args.slice(1); // drop the command token (e.g. "buy") itself
1838
2739
  let parsed;