@evomap/evolver 1.88.1 → 1.88.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +148 -3
- package/package.json +2 -1
- package/src/adapters/claudeCode.js +21 -1
- package/src/adapters/hookAdapter.js +4 -2
- package/src/adapters/scripts/evolver-session-start.js +14 -10
- package/src/adapters/scripts/evolver-task-recall.js +173 -0
- package/src/atp/atpExecute.js +20 -7
- package/src/atp/cli.js +17 -9
- package/src/atp/protocol.js +41 -0
- package/src/config.js +23 -0
- package/src/evolve/guards.js +1 -1
- package/src/evolve/pipeline/collect.js +1 -1
- package/src/evolve/pipeline/dispatch.js +1 -1
- package/src/evolve/pipeline/enrich.js +1 -1
- package/src/evolve/pipeline/hub.js +1 -1
- package/src/evolve/pipeline/select.js +1 -1
- package/src/evolve/pipeline/signals.js +1 -1
- package/src/evolve/utils.js +1 -1
- package/src/evolve.js +1 -1
- package/src/forceUpdate.js +108 -3
- package/src/gep/a2aProtocol.js +1 -1
- package/src/gep/assetCallLog.js +40 -1
- package/src/gep/autoDistillConv.js +1 -1
- package/src/gep/autoDistillLlm.js +1 -1
- package/src/gep/candidateEval.js +1 -1
- package/src/gep/candidates.js +1 -1
- package/src/gep/contentHash.js +1 -1
- package/src/gep/conversationSniffer.js +1 -1
- package/src/gep/crypto.js +1 -1
- package/src/gep/curriculum.js +1 -1
- package/src/gep/deviceId.js +1 -1
- package/src/gep/envFingerprint.js +1 -1
- package/src/gep/epigenetics.js +1 -1
- package/src/gep/execBridge.js +1 -1
- package/src/gep/explore.js +1 -1
- package/src/gep/hash.js +1 -1
- package/src/gep/hubFetch.js +1 -1
- package/src/gep/hubReview.js +1 -1
- package/src/gep/hubSearch.js +1 -1
- package/src/gep/hubVerify.js +1 -1
- package/src/gep/learningSignals.js +1 -1
- package/src/gep/memoryGraph.js +1 -1
- package/src/gep/memoryGraphAdapter.js +1 -1
- package/src/gep/mutation.js +1 -1
- package/src/gep/narrativeMemory.js +1 -1
- package/src/gep/openPRRegistry.js +1 -1
- package/src/gep/personality.js +1 -1
- package/src/gep/policyCheck.js +1 -1
- package/src/gep/prompt.js +1 -1
- package/src/gep/recallInject.js +1 -0
- package/src/gep/recallVerifier.js +1 -1
- package/src/gep/reflection.js +1 -1
- package/src/gep/selector.js +1 -1
- package/src/gep/skillDistiller.js +1 -1
- package/src/gep/solidify.js +1 -1
- package/src/gep/strategy.js +1 -1
- package/src/gep/workspaceKeychain.js +1 -1
- package/src/proxy/index.js +22 -1
- package/src/proxy/lifecycle/manager.js +456 -2
|
@@ -6,6 +6,23 @@ const { PROXY_PROTOCOL_VERSION } = require('../mailbox/store');
|
|
|
6
6
|
const crypto = require('crypto');
|
|
7
7
|
const { hubFetch } = require('../../gep/hubFetch');
|
|
8
8
|
const { getEvomapPath } = require('../../gep/paths');
|
|
9
|
+
// last_update transit (PR #188): proxy heartbeat ferries a pending
|
|
10
|
+
// force_update outcome to the hub, then clears the state file on 2xx.
|
|
11
|
+
// Proxy DOES run the upgrade now (PR #188 follow-up, HIGH bug): the
|
|
12
|
+
// original comment "Proxy itself never runs the upgrade — telemetry-only
|
|
13
|
+
// here" reflected pre-fix behaviour. Pure proxy-mode nodes (EVOMAP_PROXY=1,
|
|
14
|
+
// no evolve loop) never traversed a2aProtocol.js sendHeartbeat, so the
|
|
15
|
+
// canonical `_maybeTriggerForceUpdateFromHeartbeat` block at
|
|
16
|
+
// a2aProtocol.js:2304 never fired for them — Hub could push force_update
|
|
17
|
+
// forever with no upgrade attempt and no EvolverUpgradeAttempt row. The
|
|
18
|
+
// proxy heartbeat (200 with force_update, AND 426 with force_update in the
|
|
19
|
+
// error envelope) must mirror that logic. reportForceUpdateOutcome writes
|
|
20
|
+
// the state file the next heartbeat will pick up via body.last_update.
|
|
21
|
+
const {
|
|
22
|
+
readPendingLastUpdate,
|
|
23
|
+
clearLastUpdateOnAck,
|
|
24
|
+
reportForceUpdateOutcome,
|
|
25
|
+
} = require('../../gep/a2aProtocol');
|
|
9
26
|
|
|
10
27
|
// Hub's nodeId regex; mirror of src/gep/a2aProtocol.js so a malformed
|
|
11
28
|
// legacy file can never feed garbage into the hello payload.
|
|
@@ -47,6 +64,28 @@ const DRIFT_CHECK_MS = 30 * 1000;
|
|
|
47
64
|
const DRIFT_SLEEP_THRESHOLD_MS = 90 * 1000;
|
|
48
65
|
const DRIFT_LONG_SLEEP_THRESHOLD_MS = 30 * 60_000;
|
|
49
66
|
|
|
67
|
+
// Heartbeat-driven force_update lifecycle tracking. Mirrors
// `_forceUpdateInFlight` / `_forceUpdateLastAttemptAt` /
// `_getForceUpdateRetryCooldownMs` in src/gep/a2aProtocol.js so the proxy
// path uses the same in-flight + cooldown contract as the canonical path.
// Module-level (not instance-level) so multiple LifecycleManager instances
// in the same process serialize through one upgrade attempt — matches
// a2aProtocol.js's module-level guard. Process-local is sufficient: the
// proxy daemon runs in a single process and any sibling process would
// have its own require-cached state; cross-process serialization is the
// hub's job via directive_id dedup, not the client's.
let _proxyForceUpdateInFlight = false;
let _proxyForceUpdateLastAttemptAt = 0;

/**
 * Resolve the minimum delay between two proxy-side force_update attempts.
 *
 * Reads EVOLVER_FORCE_UPDATE_RETRY_COOLDOWN_MS (the env var shared with
 * a2aProtocol.js, so an operator tuning it affects BOTH code paths).
 *
 * @returns {number} Cooldown in milliseconds: the env value when it is a
 *   finite number >= 0 (an explicit "0" disables the cooldown), otherwise
 *   the 15-minute default.
 */
function _getProxyForceUpdateRetryCooldownMs() {
  const DEFAULT_COOLDOWN_MS = 15 * 60 * 1000;
  const raw = process.env.EVOLVER_FORCE_UPDATE_RETRY_COOLDOWN_MS;
  // Number('') and Number('   ') both coerce to 0, which would silently
  // disable the cooldown when the variable is exported but left blank
  // (`EVOLVER_FORCE_UPDATE_RETRY_COOLDOWN_MS=`). Treat blank as unset; only
  // an explicit numeric value such as "0" may turn the cooldown off.
  // NOTE(review): the blank-value handling of the a2aProtocol.js counterpart
  // is not visible from this file — confirm and align it if it differs.
  if (typeof raw !== 'string' || raw.trim() === '') return DEFAULT_COOLDOWN_MS;
  const v = Number(raw);
  if (Number.isFinite(v) && v >= 0) return v;
  return DEFAULT_COOLDOWN_MS;
}
|
|
88
|
+
|
|
50
89
|
let _cachedFingerprint = null;
|
|
51
90
|
function _getEnvFingerprint() {
|
|
52
91
|
if (_cachedFingerprint) return _cachedFingerprint;
|
|
@@ -97,6 +136,203 @@ function _readLegacyNodeId() {
|
|
|
97
136
|
return null;
|
|
98
137
|
}
|
|
99
138
|
|
|
139
|
+
// Mirror of src/gep/a2aProtocol.js `_persistNodeId`. Pure-proxy daemons
// (EVOMAP_PROXY=1, no a2aProtocol heartbeat thread) mint their own
// node_id and ONLY persist it to MailboxStore state.json. The legacy
// `~/.evomap/node_id` file never gets written, so:
//
//   1. `_shortNodeIdForStatePath` in a2aProtocol.js (used by the proxy
//      heartbeat to pick the per-node `force_update_last.<suffix>.json`
//      path) falls all the way through to 'anon' — every proxy node on
//      the same EVOLVER_HOME would collide on the same state file.
//   2. A mixed-mode install (legacy evolve loop ran once, then user
//      switched to proxy mode) is even worse: the legacy file holds a
//      DIFFERENT id than the one in MailboxStore. The proxy heartbeats
//      with body.node_id = its OWN id while writing
//      `force_update_last.<legacy-suffix>.json`. The hub-side upgrade
//      attempt row gets attributed to the wrong node.
//
// Calling this helper from hello() after the nodeId is resolved unifies
// the two persistence paths onto a single identity. Atomic write
// (per-pid tmp + rename) mirrors `_persistNodeSecret` in a2aProtocol.js;
// 0o600 mode keeps the file owner-read-only on POSIX (silently ignored
// on Windows, where %USERPROFILE% isolation is the only protection).
//
// Idempotent: if the file already holds the same id, we skip the write
// to avoid an inode churn on every hello tick. If it holds a DIFFERENT
// valid id, we still overwrite — the proxy's MailboxStore wins because
// that is the id the hub already knows us by (any rotation away from
// the legacy id was a deliberate operator action). The only way to
// re-seed a legacy id back onto a proxy install is to clear
// MailboxStore state.json (`evolver reset-local-secret`).
/**
 * Best-effort persist of the proxy's node id to the legacy node_id file.
 *
 * Candidates are tried in order (home path from `getEvomapPath('node_id')`,
 * then `.evomap_node_id` three directories above this module); the first
 * one that already matches or accepts the write ends the function.
 *
 * @param {string} id - Node id; ignored unless it matches NODE_ID_RE.
 * @returns {void} Never throws for filesystem failures — every candidate
 *   failure is swallowed and the next candidate is tried.
 */
function _persistLegacyNodeId(id) {
  if (!id || !NODE_ID_RE.test(id)) return;
  const targets = [
    getEvomapPath('node_id'),
    path.resolve(__dirname, '..', '..', '..', '.evomap_node_id'),
  ];
  // Try targets in order until one succeeds, matching the read order in
  // _readLegacyNodeId. We only need ONE persistent copy; once the home
  // path takes the write, the install-root path is unused.
  for (const file of targets) {
    // Set only once we are about to create the temp file, so the failure
    // handler knows whether there is anything to clean up.
    let tmp = null;
    try {
      // Skip if the file already matches — common steady-state path,
      // saves a syscall storm under heartbeat backoff doubling.
      try {
        if (fs.existsSync(file)) {
          const existing = fs.readFileSync(file, 'utf8').trim();
          if (existing === id) return;
        }
      } catch {
        // Unreadable -- treat as missing and try to write.
      }
      const dir = path.dirname(file);
      try {
        if (!fs.existsSync(dir)) {
          fs.mkdirSync(dir, { recursive: true, mode: 0o700 });
        }
      } catch (_) {
        // mkdir failed (read-only fs, EPERM under sandboxing). Skip
        // this candidate; the next one (install-root .evomap_node_id)
        // may still work.
        continue;
      }
      // Atomic write: a sibling evolver process (mixed-mode upgrade, two
      // proxy daemons started by hand) could otherwise race on this
      // path and leave a half-written file. Matches the pattern in
      // a2aProtocol.js `_persistNodeSecret`.
      tmp = file + '.' + process.pid + '.tmp';
      fs.writeFileSync(tmp, id, { encoding: 'utf8', mode: 0o600 });
      fs.renameSync(tmp, file);
      return;
    } catch {
      // The write or the rename failed. Remove the temp file so a failing
      // candidate does not leave a stray `<file>.<pid>.tmp` behind (one
      // per distinct pid across daemon restarts).
      if (tmp) {
        try {
          fs.unlinkSync(tmp);
        } catch {
          // Temp file was never created or is already gone.
        }
      }
      // Best-effort: continue to the next candidate. If both fail (no
      // home, no writable install root) we accept the legacy file is
      // unavailable — the proxy will still function, it just cannot
      // unify state-file suffixes with a co-resident a2aProtocol path.
    }
  }
}
|
|
216
|
+
|
|
217
|
+
// Heartbeat-driven force_update trigger for proxy-mode nodes. Mirrors
// `_maybeTriggerForceUpdateFromHeartbeat` in src/gep/a2aProtocol.js
// (search there for that name to compare). Pure proxy-mode deployments
// (EVOMAP_PROXY=1) never run the evolve run() loop or sendHeartbeat, so
// without this trigger Hub can push force_update on every heartbeat
// forever and the node will keep heartbeating on the old version — which
// is exactly what shipped before PR #188's H1 fix.
//
// Drives `executeForceUpdate` directly, gated by an in-flight lock + a
// cooldown on failures so we do not hammer npm/degit on every tick. After
// the attempt, persists the outcome via `reportForceUpdateOutcome` so the
// next heartbeat carries it as body.last_update — that's the path that
// finally writes a row to the hub's EvolverUpgradeAttempt table.
//
// Logger is injected (not a console fallback) so tests can capture the
// upgrade-path stderr without polluting CI output. The logger contract
// matches what the LifecycleManager already uses.
/**
 * Start (at most) one deferred force_update attempt for a hub directive.
 *
 * @param {object} forceUpdate - The hub's force_update directive; anything
 *   falsy or non-object is ignored.
 * @param {{log: Function, warn: Function}} logger - Sink for progress
 *   messages; a missing or throwing logger is tolerated.
 * @returns {void} The attempt itself runs later, in a microtask.
 */
function _maybeTriggerForceUpdateFromHeartbeat(forceUpdate, logger) {
  const isDirective = Boolean(forceUpdate) && typeof forceUpdate === 'object';
  if (!isDirective || _proxyForceUpdateInFlight) return;

  const startedAt = Date.now();
  const previousAttemptAt = _proxyForceUpdateLastAttemptAt;
  if (
    previousAttemptAt &&
    (startedAt - previousAttemptAt) < _getProxyForceUpdateRetryCooldownMs()
  ) {
    // A recent attempt already ran. It either failed, or ended as a no-op
    // (see the FORCE_UPDATE_NOOP path in forceUpdate.js /
    // reportForceUpdateOutcome status="skipped"); a successful attempt
    // exits the process, so the restarted process starts with fresh state.
    // Back off.
    return;
  }
  _proxyForceUpdateInFlight = true;
  _proxyForceUpdateLastAttemptAt = startedAt;

  // Capture from_version BEFORE executeForceUpdate runs. A successful
  // upgrade calls process.exit(78); the post-restart heartbeat reads the
  // state file from a fresh process where require('package.json').version
  // is the NEW version — so snapshot the CURRENTLY running version now.
  let fromVersion = '';
  try {
    const pkg = require('../../../package.json') || {};
    fromVersion = String(pkg.version || '');
  } catch (_) { /* best-effort */ }

  // Logging must never break the upgrade path. The message is built lazily
  // INSIDE the try so a failure while formatting is swallowed exactly like
  // a failure inside the logger itself.
  const emit = (level, buildMessage) => {
    try {
      logger[level](buildMessage());
    } catch (_) { /* logger broken; non-fatal */ }
  };

  // Kick off in a microtask so the heartbeat promise chain can still
  // complete (log + return {ok:true}) before the long-running upgrade
  // takes over the process. Matches a2aProtocol.js exactly.
  Promise.resolve().then(() => {
    const outcome = { updated: false, noop: false, busy: false };
    let failure = null;
    try {
      const forceUpdateModule = require('../../forceUpdate');
      const verdict = forceUpdateModule.executeForceUpdate(forceUpdate);
      // Sentinel === comparison: executeForceUpdate returns the
      // FORCE_UPDATE_NOOP symbol when the install is already at the
      // required version. We must NOT treat that as "success" — doing so
      // would (a) write a phantom {status:"success", from==to} row to
      // EvolverUpgradeAttempt, and (b) trigger an exit(78) restart with
      // nothing to restart for. The hub schema accepts status="skipped".
      outcome.noop = (verdict === forceUpdateModule.FORCE_UPDATE_NOOP);
      // FORCE_UPDATE_BUSY: another caller (e.g. a2aProtocol heartbeat
      // trigger or an evolve tick) already holds the module-level
      // _inFlight mutex in forceUpdate.js. Defensive only — the
      // module-level _proxyForceUpdateInFlight gate above and the
      // single-caller property of pure proxy mode make BUSY unreachable
      // in practice. If it does fire (mixed-mode regression, future
      // additional caller, etc.), the other caller owns the telemetry:
      // we MUST NOT write a state file or exit(78). Mirrors
      // src/gep/a2aProtocol.js (search FORCE_UPDATE_BUSY).
      outcome.busy = (verdict === forceUpdateModule.FORCE_UPDATE_BUSY);
      outcome.updated = (verdict === true);
    } catch (err) {
      failure = err;
      emit('warn', () => `[ForceUpdate] proxy heartbeat-trigger failed (non-fatal): ${err && err.message || err}`);
      outcome.updated = false;
    } finally {
      // Release the gate whatever happened so a later heartbeat can retry
      // once the cooldown has elapsed.
      _proxyForceUpdateInFlight = false;
    }

    if (outcome.busy) {
      emit('log', () => '[ForceUpdate] proxy heartbeat-trigger observed BUSY (concurrent invocation). Skipping telemetry; in-flight caller owns the outcome.');
      return;
    }

    // Persist outcome via the shared helper so the heartbeat-thread
    // trigger and the proxy trigger stay in lockstep on payload assembly
    // + validation. The next heartbeat reads this file and ferries it as
    // body.last_update — same contract as the canonical path.
    try {
      reportForceUpdateOutcome(forceUpdate, {
        updated: outcome.updated,
        noop: outcome.noop,
        error: failure,
        fromVersion: fromVersion,
      });
    } catch (err) {
      emit('warn', () => `[ForceUpdate] proxy reportForceUpdateOutcome failed (non-fatal): ${err && err.message || err}`);
    }

    if (outcome.updated) {
      emit('log', () => '[ForceUpdate] Update complete (proxy heartbeat-trigger). Exiting for restart...');
      // Guarded so a process.exit that throws cannot reject this promise.
      try { process.exit(78); } catch (_) {}
      return;
    }
    if (outcome.noop) {
      emit('log', () => '[ForceUpdate] No-op (proxy heartbeat-trigger): already at required version. Skipping restart.');
      return;
    }
    emit('warn', () => '[ForceUpdate] proxy heartbeat-trigger failed. Will retry after cooldown (' +
      Math.round(_getProxyForceUpdateRetryCooldownMs() / 60000) + 'min).');
  });
}
|
|
335
|
+
|
|
100
336
|
class AuthError extends Error {
|
|
101
337
|
constructor(message, statusCode) {
|
|
102
338
|
super(message);
|
|
@@ -121,6 +357,48 @@ class LifecycleManager {
|
|
|
121
357
|
this._consecutiveReauthFailures = 0;
|
|
122
358
|
this._driftInterval = null;
|
|
123
359
|
this._lastDriftCheckAt = 0;
|
|
360
|
+
|
|
361
|
+
// H4 fix: persist the legacy node_id file as soon as the in-memory
|
|
362
|
+
// node_id is known, NOT only after a successful hello(). The original
|
|
363
|
+
// code persisted only in hello() (see ~L390-398) — but proxy-mode boot
|
|
364
|
+
// can fire `reportForceUpdateOutcome` BEFORE hello() returns:
|
|
365
|
+
//
|
|
366
|
+
// - First-tick heartbeat hits 426 → executeForceUpdate() → exit(78)
|
|
367
|
+
// all happens BEFORE the hello() response is processed.
|
|
368
|
+
// - enrich.js force_update path can fire during the same window.
|
|
369
|
+
//
|
|
370
|
+
// `_shortNodeIdForStatePath` in a2aProtocol.js then picks the
|
|
371
|
+
// state-file suffix from `_cachedNodeId` (never set in proxy mode —
|
|
372
|
+
// only getNodeId() sets it, and proxy never calls getNodeId()) or the
|
|
373
|
+
// legacy ~/.evomap/node_id file. With both empty, it falls through
|
|
374
|
+
// to 'anon', and the outcome lands at `force_update_last.anon.json`.
|
|
375
|
+
// Next boot's hello() writes the real id, the heartbeat reads
|
|
376
|
+
// `force_update_last.<8hex>.json`, the anon file is orphaned and the
|
|
377
|
+
// outcome is silently lost.
|
|
378
|
+
//
|
|
379
|
+
// Persisting at construction closes the window. _persistLegacyNodeId
|
|
380
|
+
// early-returns on invalid input (NODE_ID_RE gate, same regex as the
|
|
381
|
+
// hello() path uses) so a malformed/empty store value is a no-op,
|
|
382
|
+
// and it is idempotent on matching content so the cost is one
|
|
383
|
+
// existsSync + one readFileSync per construction. We keep the
|
|
384
|
+
// existing post-hello call as a safety net in case hello() mints or
|
|
385
|
+
// mutates the id.
|
|
386
|
+
try {
|
|
387
|
+
const earlyNodeId = this.store && this.store.getState
|
|
388
|
+
? this.store.getState('node_id')
|
|
389
|
+
: null;
|
|
390
|
+
if (earlyNodeId && NODE_ID_RE.test(earlyNodeId)) {
|
|
391
|
+
_persistLegacyNodeId(earlyNodeId);
|
|
392
|
+
}
|
|
393
|
+
} catch (e) {
|
|
394
|
+
// Best-effort: persistence failure must never break construction.
|
|
395
|
+
// Logger may not exist if tests passed undefined; guard the call.
|
|
396
|
+
try {
|
|
397
|
+
this.logger.warn(
|
|
398
|
+
`[lifecycle] early persist of legacy node_id failed (non-fatal): ${e && e.message || e}`
|
|
399
|
+
);
|
|
400
|
+
} catch (_) { /* logger broken; non-fatal */ }
|
|
401
|
+
}
|
|
124
402
|
}
|
|
125
403
|
|
|
126
404
|
get nodeId() {
|
|
@@ -293,6 +571,27 @@ class LifecycleManager {
|
|
|
293
571
|
}
|
|
294
572
|
|
|
295
573
|
this.store.setState('node_id', nodeId);
|
|
574
|
+
// Unify proxy node_id with the legacy GEP file. Without this, the
|
|
575
|
+
// proxy-only fast path (EVOMAP_PROXY=1) never seeds
|
|
576
|
+
// ~/.evomap/node_id and `_shortNodeIdForStatePath` in a2aProtocol
|
|
577
|
+
// (used to pick the per-node `force_update_last.<suffix>.json`
|
|
578
|
+
// path for upgrade telemetry) falls through to 'anon' — every
|
|
579
|
+
// proxy node under the same EVOLVER_HOME would collide on the
|
|
580
|
+
// same state file. In a mixed-mode install where the legacy file
|
|
581
|
+
// holds a DIFFERENT (stale) id, the helper overwrites it so the
|
|
582
|
+
// state-file suffix matches `this.nodeId` — the id the hub sees
|
|
583
|
+
// in body.node_id. We persist AFTER hello succeeds so a rejected
|
|
584
|
+
// first-boot mint never commits to disk; on a rejection the next
|
|
585
|
+
// tick will mint fresh again (existing behaviour).
|
|
586
|
+
try {
|
|
587
|
+
_persistLegacyNodeId(nodeId);
|
|
588
|
+
} catch (e) {
|
|
589
|
+
// Best-effort: persistence failure must never break hello. Log
|
|
590
|
+
// and move on — the proxy still functions, the state-file
|
|
591
|
+
// suffix just falls back to 'anon' until the next successful
|
|
592
|
+
// hello retries the write.
|
|
593
|
+
this.logger.warn(`[lifecycle] failed to persist legacy node_id (non-fatal): ${e && e.message || e}`);
|
|
594
|
+
}
|
|
296
595
|
this.logger.log(`[lifecycle] hello OK, node_id=${nodeId}${rotateSecret ? ' (secret rotated)' : ''}`);
|
|
297
596
|
return { ok: true, nodeId, response: data };
|
|
298
597
|
} catch (err) {
|
|
@@ -448,6 +747,21 @@ class LifecycleManager {
|
|
|
448
747
|
},
|
|
449
748
|
};
|
|
450
749
|
|
|
750
|
+
// Attach any pending force_update outcome so the hub-side
|
|
751
|
+
// EvolverUpgradeAttempt table gets a row. Captured in a local so the
|
|
752
|
+
// post-2xx clear matches identity (rotation-safe — see
|
|
753
|
+
// _clearLastUpdateStateIfMatches). Never let telemetry throw.
|
|
754
|
+
let capturedLastUpdate = null;
|
|
755
|
+
try {
|
|
756
|
+
const pending = readPendingLastUpdate();
|
|
757
|
+
if (pending) {
|
|
758
|
+
body.last_update = pending;
|
|
759
|
+
capturedLastUpdate = pending;
|
|
760
|
+
}
|
|
761
|
+
} catch (e) {
|
|
762
|
+
this.logger.warn(`[lifecycle] readPendingLastUpdate failed (non-fatal): ${e && e.message || e}`);
|
|
763
|
+
}
|
|
764
|
+
|
|
451
765
|
const res = await hubFetch(endpoint, {
|
|
452
766
|
method: 'POST',
|
|
453
767
|
headers: this._buildHeaders(),
|
|
@@ -470,8 +784,70 @@ class LifecycleManager {
|
|
|
470
784
|
}
|
|
471
785
|
|
|
472
786
|
if (!res.ok) {
|
|
473
|
-
this._consecutiveFailures++;
|
|
474
787
|
const errText = await res.text().catch(() => '');
|
|
788
|
+
// 426 Upgrade Required: hub emits this when our evolver_version is
|
|
789
|
+
// below the minimum version it requires. The body is JSON of shape
|
|
790
|
+
// `{ error: 'evolver_min_version_required', force_update: {...} }`
|
|
791
|
+
// (see hub `src/routes/a2a/_middleware.js`). Pre-fix this fell
|
|
792
|
+
// through to the generic `http_426` error and the proxy never
|
|
793
|
+
// attempted the upgrade — defeating the very mechanism that 426
|
|
794
|
+
// exists to drive. Mirror the 200+force_update path: parse the
|
|
795
|
+
// body, fire executeForceUpdate (which writes the state file via
|
|
796
|
+
// reportForceUpdateOutcome), and let the next heartbeat carry the
|
|
797
|
+
// attempt as body.last_update. Still return an error so the
|
|
798
|
+
// caller's failure counter ticks and the loop backs off.
|
|
799
|
+
if (res.status === 426) {
|
|
800
|
+
let parsed = null;
|
|
801
|
+
try { parsed = JSON.parse(errText); } catch (_) { /* body not JSON */ }
|
|
802
|
+
const fu = parsed && parsed.force_update;
|
|
803
|
+
if (fu && typeof fu === 'object') {
|
|
804
|
+
this.logger.warn(
|
|
805
|
+
`[lifecycle] heartbeat HTTP 426 with force_update directive (required=${
|
|
806
|
+
fu.required_version || '?'
|
|
807
|
+
}) — triggering executeForceUpdate`
|
|
808
|
+
);
|
|
809
|
+
_maybeTriggerForceUpdateFromHeartbeat(fu, this.logger);
|
|
810
|
+
} else {
|
|
811
|
+
this.logger.warn(
|
|
812
|
+
`[lifecycle] heartbeat HTTP 426 without parseable force_update payload: ${errText}`
|
|
813
|
+
);
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
// Hub 400 circuit breaker (mirrors a2aProtocol.js sendHeartbeat
|
|
817
|
+
// ~L2376-2411): if last_update was attached this tick and the hub
|
|
818
|
+
// rejected the body with 400 AND the rejection names the
|
|
819
|
+
// last_update field, the state file is poisoning every heartbeat
|
|
820
|
+
// (e.g. downgrade-then-upgrade left a payload the new hub schema
|
|
821
|
+
// rejects, or a manual edit corrupted the JSON shape). The proxy
|
|
822
|
+
// path used to lack this breaker entirely, so a single bad payload
|
|
823
|
+
// would block telemetry forever -- every retry re-sends the same
|
|
824
|
+
// poison and re-fails with 400. Single-strike (no counter): the
|
|
825
|
+
// 400 + last_update substring pair is unambiguous enough that
|
|
826
|
+
// waiting for repeats just delays recovery. Scope intentionally
|
|
827
|
+
// narrowed to 400-only (NOT any 4xx): 401/403 are auth errors
|
|
828
|
+
// (handled above), 404/405/409 etc. are hub-routing problems that
|
|
829
|
+
// are not the payload's fault. The existing _consecutiveFailures
|
|
830
|
+
// backoff is preserved -- the breaker runs BEFORE the early
|
|
831
|
+
// return so the file is cleared, and then the normal failure
|
|
832
|
+
// path continues unchanged.
|
|
833
|
+
if (res.status === 400 && capturedLastUpdate) {
|
|
834
|
+
const errorText = 'http_400: ' + errText;
|
|
835
|
+
if (/last[_-]?update/i.test(errorText)) {
|
|
836
|
+
// Bypass any rate-limited warn helper: this is a critical
|
|
837
|
+
// recovery signal that must surface even if other ForceUpdate
|
|
838
|
+
// warns fired recently.
|
|
839
|
+
this.logger.warn(
|
|
840
|
+
'[lifecycle] hub 400 with last_update attached (error names last_update); ' +
|
|
841
|
+
'clearing poisoning state file.'
|
|
842
|
+
);
|
|
843
|
+
try {
|
|
844
|
+
clearLastUpdateOnAck(capturedLastUpdate);
|
|
845
|
+
} catch (e) {
|
|
846
|
+
this.logger.warn(`[lifecycle] clearLastUpdateOnAck failed (non-fatal): ${e && e.message || e}`);
|
|
847
|
+
}
|
|
848
|
+
}
|
|
849
|
+
}
|
|
850
|
+
this._consecutiveFailures++;
|
|
475
851
|
this.logger.error(`[lifecycle] heartbeat HTTP ${res.status}: ${errText}`);
|
|
476
852
|
return { ok: false, error: `http_${res.status}`, statusCode: res.status };
|
|
477
853
|
}
|
|
@@ -481,11 +857,73 @@ class LifecycleManager {
|
|
|
481
857
|
this._consecutiveFailures = 0;
|
|
482
858
|
this.store.setState('last_heartbeat_at', new Date().toISOString());
|
|
483
859
|
|
|
860
|
+
// Semantic parity with a2aProtocol.js sendHeartbeat: a 2xx with
|
|
861
|
+
// `{ok:false}` or `status:'unknown_node'` is NOT a hub-side persist,
|
|
862
|
+
// so the state file must survive for the next heartbeat to retry
|
|
863
|
+
// (unknown_node triggers a re-hello below).
|
|
864
|
+
//
|
|
865
|
+
// PR #188 follow-up (HIGH H1-client): the hub now writes a top-level
|
|
866
|
+
// `last_update_ack: { ok, reason? }` whenever the request carried a
|
|
867
|
+
// last_update payload. Gate the clear on the ack so we do not unlink
|
|
868
|
+
// the only evidence of the upgrade attempt when the hub's
|
|
869
|
+
// fire-and-forget persist throws / dedup-misses / schema-rejects /
|
|
870
|
+
// bypass-path returns false. Backward compat: an old hub that has not
|
|
871
|
+
// yet rolled out the ack writer falls back to the original bare-2xx
|
|
872
|
+
// semantics so this client keeps working against pre-rollout hubs.
|
|
873
|
+
// See src/gep/a2aProtocol.js sendHeartbeat for the canonical comment.
|
|
874
|
+
const hubAccepted = !(data && data.ok === false) && data?.status !== 'unknown_node';
|
|
875
|
+
if (capturedLastUpdate) {
|
|
876
|
+
const ack = data && data.last_update_ack;
|
|
877
|
+
const hasAck = ack && typeof ack === 'object';
|
|
878
|
+
let shouldClear;
|
|
879
|
+
if (hasAck) {
|
|
880
|
+
shouldClear = ack.ok === true
|
|
881
|
+
|| ack.reason === 'duplicate'
|
|
882
|
+
|| ack.reason === 'invalid';
|
|
883
|
+
if (ack.reason === 'failed') {
|
|
884
|
+
this.logger.warn('[lifecycle] hub last_update_ack=failed; ' +
|
|
885
|
+
'keeping state file for retry on next heartbeat.');
|
|
886
|
+
} else if (ack.reason === 'invalid') {
|
|
887
|
+
this.logger.warn('[lifecycle] hub last_update_ack=invalid; ' +
|
|
888
|
+
'clearing state file (retry will not help).');
|
|
889
|
+
}
|
|
890
|
+
} else {
|
|
891
|
+
shouldClear = hubAccepted;
|
|
892
|
+
}
|
|
893
|
+
if (shouldClear) {
|
|
894
|
+
try {
|
|
895
|
+
clearLastUpdateOnAck(capturedLastUpdate);
|
|
896
|
+
} catch (e) {
|
|
897
|
+
this.logger.warn(`[lifecycle] clearLastUpdateOnAck failed (non-fatal): ${e && e.message || e}`);
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
|
|
484
902
|
if (data?.status === 'unknown_node') {
|
|
485
903
|
this.logger.warn('[lifecycle] Node unknown, re-registering...');
|
|
486
904
|
await this.hello();
|
|
487
905
|
}
|
|
488
906
|
|
|
907
|
+
// PR #188 H1 fix: 200 with a `force_update` directive must drive
|
|
908
|
+
// executeForceUpdate the same way a2aProtocol.js does for
|
|
909
|
+
// non-proxy nodes (see a2aProtocol.js:2292-2305 and
|
|
910
|
+
// _maybeTriggerForceUpdateFromHeartbeat). Pure proxy-mode nodes
|
|
911
|
+
// never enter the evolve run() loop, so the consumeForceUpdate
|
|
912
|
+
// path never fires for them — without this block the hub could
|
|
913
|
+
// push force_update forever with zero upgrade attempts and zero
|
|
914
|
+
// EvolverUpgradeAttempt rows. The helper is in-flight + cooldown
|
|
915
|
+
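// gated and defers the upgrade to a microtask; the call sits before the
|
|
916
|
+
// events block (and pre-min_version banner), yet a response carrying both
|
|
917
|
+
// events AND a force_update still handles the events first -- assuming that events handling stays synchronous (TODO confirm).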
|
|
918
|
+
if (data && data.force_update && typeof data.force_update === 'object') {
|
|
919
|
+
this.logger.log(
|
|
920
|
+
'[ForceUpdate] Hub requires update to ' +
|
|
921
|
+
(data.force_update.required_version || '?') +
|
|
922
|
+
' -- reason: ' + (data.force_update.reason || 'unspecified')
|
|
923
|
+
);
|
|
924
|
+
_maybeTriggerForceUpdateFromHeartbeat(data.force_update, this.logger);
|
|
925
|
+
}
|
|
926
|
+
|
|
489
927
|
if (Array.isArray(data?.events) && data.events.length > 0) {
|
|
490
928
|
this.store.writeInboundBatch(
|
|
491
929
|
data.events.map(e => ({
|
|
@@ -665,4 +1103,20 @@ class LifecycleManager {
|
|
|
665
1103
|
}
|
|
666
1104
|
}
|
|
667
1105
|
|
|
668
|
-
module.exports = {
|
|
1106
|
+
module.exports = {
|
|
1107
|
+
LifecycleManager,
|
|
1108
|
+
AuthError,
|
|
1109
|
+
DEFAULT_HEARTBEAT_INTERVAL,
|
|
1110
|
+
HEARTBEAT_BACKOFF_CAP_MS,
|
|
1111
|
+
// Test hooks behind `_testing` to mirror the namespacing used by
|
|
1112
|
+
// a2aProtocol.js — production callers must not accidentally tweak the
|
|
1113
|
+
// proxy force_update lifecycle state.
|
|
1114
|
+
_testing: {
|
|
1115
|
+
// Reset proxy heartbeat-driven force_update state. Avoids cooldown
|
|
1116
|
+
// leakage between sibling tests that share one process.
|
|
1117
|
+
_resetProxyForceUpdateStateForTesting: function () {
|
|
1118
|
+
_proxyForceUpdateInFlight = false;
|
|
1119
|
+
_proxyForceUpdateLastAttemptAt = 0;
|
|
1120
|
+
},
|
|
1121
|
+
},
|
|
1122
|
+
};
|