clementine-agent 1.1.13 → 1.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,8 +10,17 @@
  * data pipeline is identical and any divergence is purely rule-evaluation.
  */
 import { CronRunLog } from '../../gateway/cron-scheduler.js';
-import {
-
+import { DEFAULT_MAX_TURNS_FALLBACK, DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS, TIER_MAX_TURNS, getInterventionStats, readReflections, } from '../execution-advisor.js';
+// NOTE: Phase 9c (commit 4451f36) made execution-advisor.ts static-import
+// THIS module, creating a circular dep. The previous module-init line
+// `void CIRCUIT_BREAKER_COOLDOWN_MS as _COOLDOWN_MS` was deferring access
+// in source comment terms but actually FORCED a TDZ access at context.ts
+// module-init — which is BEFORE execution-advisor.ts has reached line 38
+// where the const is declared. That produced "Cannot access '_COOLDOWN_MS'
+// before initialization" errors on every cron run after Phase 9c shipped.
+// Removed the import + the void line. The cooldown duration is encoded as
+// a literal in the builtin YAML rules, so this module never actually
+// needed the constant — the import was documentation noise.
 /**
  * Build a fresh RuleContext for a job. Pass an existing `advice` if you want
  * to mutate it (e.g. shadow mode passes a clone so the TS path's advice is
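The TDZ failure the note describes is a general ESM hazard rather than anything specific to this package. A minimal two-module sketch reproduces the same `ReferenceError`; file and constant names here are hypothetical, chosen only to mirror the context.ts / execution-advisor.ts pair:

```ts
// advisor.ts — stands in for execution-advisor.ts (hypothetical names)
import { buildContext } from './context.js'; // the static import that creates the cycle
export const COOLDOWN_MS = 5 * 60_000;       // initialized only once this module finishes evaluating
export function advise() { return buildContext(); }

// context.ts — stands in for the module patched above
import { COOLDOWN_MS } from './advisor.js';
export function buildContext() { return {}; }
// Because advisor.ts imports context.ts, context.ts runs to completion FIRST,
// so this module-init read happens before COOLDOWN_MS is initialized and throws:
//   ReferenceError: Cannot access 'COOLDOWN_MS' before initialization
void COOLDOWN_MS;
```

Deleting the import, as this release does, removes the cycle edge entirely; the other standard fix is to move the read into a function body so it happens after both modules have evaluated.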
@@ -16,6 +16,14 @@
  * returns false and all graph features are silently skipped.
  */
 import type { EntityNode, EntityRef, GraphSyncStats, PathResult, RelationshipTriplet, TraversalResult } from '../types.js';
+/**
+ * Phase 13 — extract structured info from arbitrary error objects so log
+ * entries always carry SOMETHING useful even when err.message is empty.
+ * Node socket errors have .code, .errno, .syscall, .address, .port that
+ * tell us "ECONNREFUSED on /tmp/x.sock" instead of the empty string the
+ * falkordb client surfaces by default.
+ */
+export declare function extractErrorInfo(err: unknown): Record<string, unknown>;
 export declare class GraphStore {
     private db;
     private client;
@@ -23,6 +31,9 @@ export declare class GraphStore {
     private available;
     private persistenceDir;
     private ownsServer;
+    private livenessProbeTimer;
+    private livenessFailureStreak;
+    private livenessRestartAttempts;
     constructor(persistenceDir: string);
     /** Get the socket file path for this instance's data dir. */
     private get socketFilePath();
@@ -37,6 +48,19 @@ export declare class GraphStore {
      */
     connectToRunning(): Promise<boolean>;
     isAvailable(): boolean;
+    /**
+     * Periodic health check on the owned embedded server. Runs every 60s.
+     * Pings with a trivial Cypher query (RETURN 1). Two consecutive failures
+     * trigger a restart attempt. Backs off after 3 restart attempts to avoid
+     * crash-loop noise — at that point graph features stay disabled until
+     * the daemon is manually restarted.
+     *
+     * Daemon-only because client processes have their own reconnect loop on
+     * the .on('error') path. The probe specifically catches the case where
+     * the server hangs (no error event but stops responding).
+     */
+    private startLivenessProbe;
+    private attemptServerRestart;
     close(): Promise<void>;
     upsertEntity(label: string, id: string, props: Record<string, any>): Promise<void>;
     getEntity(label: string, id: string): Promise<EntityNode | null>;
@@ -22,6 +22,46 @@ import pino from 'pino';
 const logger = pino({ name: 'clementine.graph' });
 const GRAPH_NAME = 'clementine';
 const WIKILINK_RE = /\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g;
+/**
+ * Phase 13 — extract structured info from arbitrary error objects so log
+ * entries always carry SOMETHING useful even when err.message is empty.
+ * Node socket errors have .code, .errno, .syscall, .address, .port that
+ * tell us "ECONNREFUSED on /tmp/x.sock" instead of the empty string the
+ * falkordb client surfaces by default.
+ */
+export function extractErrorInfo(err) {
+    if (!err)
+        return { errKind: 'no-error-object' };
+    if (typeof err !== 'object')
+        return { errKind: 'primitive', err: String(err).slice(0, 200) };
+    const e = err;
+    const out = {};
+    if (typeof e.message === 'string' && e.message.length > 0)
+        out.errMessage = e.message.slice(0, 200);
+    if (typeof e.name === 'string')
+        out.errName = e.name;
+    if (e.code !== undefined)
+        out.errCode = e.code;
+    if (e.errno !== undefined)
+        out.errno = e.errno;
+    if (e.syscall !== undefined)
+        out.errSyscall = e.syscall;
+    if (e.address !== undefined)
+        out.errAddress = e.address;
+    if (e.port !== undefined)
+        out.errPort = e.port;
+    if (e.path !== undefined)
+        out.errPath = e.path;
+    // Fall back to the constructor name + JSON shape if nothing meaningful surfaced
+    if (Object.keys(out).length === 0) {
+        out.errKind = e.constructor?.name ?? 'unknown';
+        try {
+            out.errJson = JSON.stringify(e).slice(0, 300);
+        }
+        catch { /* ignore */ }
+    }
+    return out;
+}
 /** Well-known file where the daemon writes the socket path for other processes. */
 const SOCKET_FILE_NAME = '.graph.sock';
 export class GraphStore {
|
|
|
31
71
|
available = false;
|
|
32
72
|
persistenceDir;
|
|
33
73
|
ownsServer = false;
|
|
74
|
+
livenessProbeTimer = null;
|
|
75
|
+
livenessFailureStreak = 0;
|
|
76
|
+
livenessRestartAttempts = 0;
|
|
34
77
|
constructor(persistenceDir) {
|
|
35
78
|
this.persistenceDir = persistenceDir;
|
|
36
79
|
}
|
|
@@ -53,15 +96,21 @@ export class GraphStore {
        this.graph = this.db.selectGraph(GRAPH_NAME);
        this.available = true;
        this.ownsServer = true;
-        // Catch connection-level errors
+        // Catch connection-level errors with full diagnosis (Phase 13).
        let serverErrorLogged = false;
        this.db.on?.('error', (err) => {
            if (!serverErrorLogged) {
                serverErrorLogged = true;
-                logger.warn(
+                logger.warn(extractErrorInfo(err), 'FalkorDB server error — disabling graph features');
                this.available = false;
            }
        });
+        // Phase 13 — server-side liveness probe with auto-restart.
+        // Periodically (60s) ping the embedded server with a tiny query.
+        // If the ping fails, the server has hung or quietly died. Auto-
+        // restart by reinitializing instead of leaving graph features
+        // silently broken until the next daemon restart.
+        this.startLivenessProbe();
        // Write socket path so MCP/dashboard/assistant can connect
        writeFileSync(this.socketFilePath, this.db.socketPath, 'utf-8');
        // Create indexes for fast lookups
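The `logger.warn(obj, msg)` call uses pino's merging-object form: the first argument's keys are spread into the emitted JSON line, so the extracted fields become top-level, queryable keys. Roughly (pid/hostname omitted; exact fields depend on the error and pino config):

```ts
import pino from 'pino';
import { extractErrorInfo } from './graph-store.js'; // path assumed for illustration

const logger = pino({ name: 'clementine.graph' });
const err = Object.assign(new Error(''), { code: 'ECONNREFUSED', syscall: 'connect' });
logger.warn(extractErrorInfo(err), 'FalkorDB server error — disabling graph features');
// → {"level":40,"time":...,"name":"clementine.graph","errName":"Error",
//    "errCode":"ECONNREFUSED","errSyscall":"connect",
//    "msg":"FalkorDB server error — disabling graph features"}
```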
@@ -103,47 +152,62 @@ export class GraphStore {
            this.graph = this.client.selectGraph(GRAPH_NAME);
            this.available = true;
            this.ownsServer = false;
-            // Catch connection-level errors: disable and start reconnect loop
+            // Catch connection-level errors: disable and start reconnect loop.
+            // Phase 13: capture FULL error context (errno, code, syscall) instead
+            // of just .message — falkordb client emits raw socket errors that
+            // often have empty .message strings, leaving us blind to root cause.
            let errorHandled = false;
            this.client.on?.('error', (err) => {
                if (errorHandled)
                    return;
                errorHandled = true;
-
+                const errInfo = extractErrorInfo(err);
+                logger.warn({ ...errInfo, pid: process.pid, socketPath }, 'FalkorDB connection lost — starting reconnect loop');
                this.available = false;
                try {
                    this.client?.disconnect?.();
                }
                catch { /* ignore */ }
-                // Reconnect
+                // Reconnect schedule (Phase 13): exponential backoff from 1s up to
+                // 60s for the first 5 attempts (covers transient blips fast), then
+                // 5min × 5 (handles a daemon-restart window), then a slow 30min
+                // probe forever. Total time to "give up fast retries" reduced from
+                // 150s (5×30s) to 104s (1+3+10+30+60) but the EARLY attempts
+                // are much more aggressive — most blips recover within seconds.
+                const SCHEDULE_MS = [
+                    1_000, 3_000, 10_000, 30_000, 60_000,
+                    5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000,
+                ];
+                const SLOW_PROBE_MS = 30 * 60_000;
                let attempts = 0;
                const reconnectLoop = async () => {
                    attempts++;
                    try {
                        const reconnected = await this.connectToRunning();
                        if (reconnected) {
-                            logger.info({ attempts }, 'FalkorDB reconnected');
-                            return;
+                            logger.info({ attempts, pid: process.pid }, 'FalkorDB reconnected');
+                            return;
                        }
                    }
-                    catch {
-
-
+                    catch (retryErr) {
+                        // Capture retry failures too — silent catch was hiding root cause
+                        const ri = extractErrorInfo(retryErr);
+                        logger.debug({ ...ri, attempts }, 'FalkorDB reconnect attempt failed');
                    }
-
-
-
-                    else {
-                        // Keep a slow background probe instead of giving up entirely
-                        logger.warn({ attempts }, 'FalkorDB reconnect entering slow probe (every 30 min)');
-                        setTimeout(reconnectLoop, 30 * 60_000);
+                    const delay = SCHEDULE_MS[attempts - 1] ?? SLOW_PROBE_MS;
+                    if (attempts === SCHEDULE_MS.length) {
+                        logger.warn({ attempts, pid: process.pid }, 'FalkorDB reconnect exhausted fast-retry schedule — entering slow probe (every 30 min)');
                    }
+                    setTimeout(reconnectLoop, delay);
                };
-                setTimeout(reconnectLoop,
+                setTimeout(reconnectLoop, SCHEDULE_MS[0]);
            });
            return true;
        }
-        catch {
+        catch (err) {
+            // Phase 13: log connect-time failures too (was swallowing them silently)
+            const errInfo = extractErrorInfo(err);
+            logger.debug({ ...errInfo, socketPath: this.socketFilePath }, 'FalkorDB initial connect failed');
            this.available = false;
            return false;
        }
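For reference, here is the delay the new schedule produces after the n-th failed attempt, mirroring the `SCHEDULE_MS[attempts - 1] ?? SLOW_PROBE_MS` lookup above. This is a standalone sketch, not package code:

```ts
const SCHEDULE_MS = [
    1_000, 3_000, 10_000, 30_000, 60_000,                        // fast retries (104s total)
    5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000, // daemon-restart window
];
const SLOW_PROBE_MS = 30 * 60_000;

// Delay scheduled after the n-th failed attempt (1-based); past the end of
// the schedule the nullish fallback yields the 30-minute slow probe.
const delayAfterAttempt = (n: number): number => SCHEDULE_MS[n - 1] ?? SLOW_PROBE_MS;

console.log(delayAfterAttempt(1));  // 1000    (retry almost immediately)
console.log(delayAfterAttempt(5));  // 60000   (last of the fast retries)
console.log(delayAfterAttempt(10)); // 300000  (last scheduled 5-min retry)
console.log(delayAfterAttempt(11)); // 1800000 (slow probe from here on, forever)
```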
@@ -151,7 +215,101 @@ export class GraphStore {
     isAvailable() {
         return this.available;
     }
+    // ── Liveness probe (daemon only — Phase 13) ──────────────────────────
+    /**
+     * Periodic health check on the owned embedded server. Runs every 60s.
+     * Pings with a trivial Cypher query (RETURN 1). Two consecutive failures
+     * trigger a restart attempt. Backs off after 3 restart attempts to avoid
+     * crash-loop noise — at that point graph features stay disabled until
+     * the daemon is manually restarted.
+     *
+     * Daemon-only because client processes have their own reconnect loop on
+     * the .on('error') path. The probe specifically catches the case where
+     * the server hangs (no error event but stops responding).
+     */
+    startLivenessProbe() {
+        if (this.livenessProbeTimer)
+            return;
+        const PROBE_INTERVAL_MS = 60_000;
+        const MAX_RESTART_ATTEMPTS = 3;
+        const probe = async () => {
+            if (!this.ownsServer)
+                return; // safety — only the server owner probes
+            if (!this.available || !this.graph) {
+                // Already disabled — let the existing recovery paths run.
+                return;
+            }
+            try {
+                await Promise.race([
+                    this.graph.query('RETURN 1 AS ping'),
+                    new Promise((_, reject) => setTimeout(() => reject(new Error('probe-timeout')), 5_000)),
+                ]);
+                // Probe succeeded — reset failure streak.
+                if (this.livenessFailureStreak > 0) {
+                    logger.info({ priorFailures: this.livenessFailureStreak }, 'FalkorDB liveness probe recovered');
+                    this.livenessFailureStreak = 0;
+                }
+            }
+            catch (err) {
+                this.livenessFailureStreak++;
+                logger.warn({ ...extractErrorInfo(err), streak: this.livenessFailureStreak }, 'FalkorDB liveness probe failed');
+                if (this.livenessFailureStreak >= 2) {
+                    await this.attemptServerRestart(MAX_RESTART_ATTEMPTS);
+                }
+            }
+        };
+        this.livenessProbeTimer = setInterval(probe, PROBE_INTERVAL_MS);
+        // Don't keep the daemon alive just for the probe.
+        this.livenessProbeTimer.unref?.();
+    }
+    async attemptServerRestart(maxAttempts) {
+        if (this.livenessRestartAttempts >= maxAttempts) {
+            logger.error({ attempts: this.livenessRestartAttempts }, 'FalkorDB restart attempts exhausted — graph features disabled until daemon restart');
+            this.available = false;
+            if (this.livenessProbeTimer) {
+                clearInterval(this.livenessProbeTimer);
+                this.livenessProbeTimer = null;
+            }
+            return;
+        }
+        this.livenessRestartAttempts++;
+        logger.warn({ attempt: this.livenessRestartAttempts, max: maxAttempts }, 'FalkorDB restart attempt');
+        try {
+            // Tear down the current server gracefully.
+            try {
+                await this.db?.close?.();
+            }
+            catch { /* ignore */ }
+            this.db = null;
+            this.graph = null;
+            this.available = false;
+            try {
+                unlinkSync(this.socketFilePath);
+            }
+            catch { /* ignore */ }
+            // Re-initialize. initialize() will re-register error handlers and
+            // re-start the probe — but we don't want to start a NESTED probe,
+            // so clear the timer first.
+            if (this.livenessProbeTimer) {
+                clearInterval(this.livenessProbeTimer);
+                this.livenessProbeTimer = null;
+            }
+            this.livenessFailureStreak = 0;
+            await this.initialize();
+            if (this.available) {
+                logger.info({ attempt: this.livenessRestartAttempts }, 'FalkorDB restart succeeded');
+            }
+        }
+        catch (err) {
+            logger.error(extractErrorInfo(err), 'FalkorDB restart attempt failed');
+        }
+    }
     async close() {
+        // Stop the liveness probe before tearing down (Phase 13).
+        if (this.livenessProbeTimer) {
+            clearInterval(this.livenessProbeTimer);
+            this.livenessProbeTimer = null;
+        }
         if (this.ownsServer && this.db) {
             // Clean up socket file
             try {