clementine-agent 1.1.13 → 1.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,8 +10,17 @@
   * data pipeline is identical and any divergence is purely rule-evaluation.
   */
  import { CronRunLog } from '../../gateway/cron-scheduler.js';
- import { CIRCUIT_BREAKER_COOLDOWN_MS as _COOLDOWN_MS, DEFAULT_MAX_TURNS_FALLBACK, DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS, TIER_MAX_TURNS, getInterventionStats, readReflections, } from '../execution-advisor.js';
- void _COOLDOWN_MS; // currently encoded as a literal in builtin YAMLs; re-export hook
+ import { DEFAULT_MAX_TURNS_FALLBACK, DEFAULT_TIMEOUT_MS, MAX_TIMEOUT_MS, TIER_MAX_TURNS, getInterventionStats, readReflections, } from '../execution-advisor.js';
+ // NOTE: Phase 9c (commit 4451f36) made execution-advisor.ts static-import
+ // THIS module, creating a circular dep. The previous module-init line
+ // `void _COOLDOWN_MS;` claimed (per its own comment) to be a passive
+ // re-export hook, but it actually forced a TDZ access at context.ts
+ // module-init — which runs BEFORE execution-advisor.ts has reached line 38
+ // where the const is declared. That produced "Cannot access '_COOLDOWN_MS'
+ // before initialization" errors on every cron run after Phase 9c shipped.
+ // Removed the import + the void line. The cooldown duration is encoded as
+ // a literal in the builtin YAML rules, so this module never actually
+ // needed the constant — the import was documentation noise.
  /**
   * Build a fresh RuleContext for a job. Pass an existing `advice` if you want
   * to mutate it (e.g. shadow mode passes a clone so the TS path's advice is
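
The TDZ crash described in the NOTE above is easy to reproduce with two toy ES modules (hypothetical file names; a minimal sketch of the circular-import timing, not the package's actual files):

// a.ts: entry module; its first statement pulls in b.ts
import './b.js';
export const CIRCUIT_BREAKER_COOLDOWN_MS = 60_000;

// b.ts: its module body runs to completion before a.ts finishes initializing
import { CIRCUIT_BREAKER_COOLDOWN_MS as _COOLDOWN_MS } from './a.js';
// Throws ReferenceError: Cannot access '_COOLDOWN_MS' before initialization.
// The import binding exists, but the const behind it is still in its
// temporal dead zone when b.ts executes.
void _COOLDOWN_MS;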
@@ -16,6 +16,14 @@
   * returns false and all graph features are silently skipped.
   */
  import type { EntityNode, EntityRef, GraphSyncStats, PathResult, RelationshipTriplet, TraversalResult } from '../types.js';
+ /**
+  * Phase 13 — extract structured info from arbitrary error objects so log
+  * entries always carry SOMETHING useful even when err.message is empty.
+  * Node socket errors have .code, .errno, .syscall, .address, .port that
+  * tell us "ECONNREFUSED on /tmp/x.sock" instead of the empty string the
+  * falkordb client surfaces by default.
+  */
+ export declare function extractErrorInfo(err: unknown): Record<string, unknown>;
  export declare class GraphStore {
      private db;
      private client;
@@ -23,6 +31,9 @@ export declare class GraphStore {
      private available;
      private persistenceDir;
      private ownsServer;
+     private livenessProbeTimer;
+     private livenessFailureStreak;
+     private livenessRestartAttempts;
      constructor(persistenceDir: string);
      /** Get the socket file path for this instance's data dir. */
      private get socketFilePath();
@@ -37,6 +48,19 @@ export declare class GraphStore {
       */
      connectToRunning(): Promise<boolean>;
      isAvailable(): boolean;
+     /**
+      * Periodic health check on the owned embedded server. Runs every 60s.
+      * Pings with a trivial Cypher query (RETURN 1). Two consecutive failures
+      * trigger a restart attempt. Gives up after 3 restart attempts to avoid
+      * crash-loop noise — at that point graph features stay disabled until
+      * the daemon is manually restarted.
+      *
+      * Daemon-only because client processes have their own reconnect loop on
+      * the .on('error') path. The probe specifically catches the case where
+      * the server hangs (no error event but stops responding).
+      */
+     private startLivenessProbe;
+     private attemptServerRestart;
      close(): Promise<void>;
      upsertEntity(label: string, id: string, props: Record<string, any>): Promise<void>;
      getEntity(label: string, id: string): Promise<EntityNode | null>;
@@ -22,6 +22,46 @@ import pino from 'pino';
  const logger = pino({ name: 'clementine.graph' });
  const GRAPH_NAME = 'clementine';
  const WIKILINK_RE = /\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g;
+ /**
+  * Phase 13 — extract structured info from arbitrary error objects so log
+  * entries always carry SOMETHING useful even when err.message is empty.
+  * Node socket errors have .code, .errno, .syscall, .address, .port that
+  * tell us "ECONNREFUSED on /tmp/x.sock" instead of the empty string the
+  * falkordb client surfaces by default.
+  */
+ export function extractErrorInfo(err) {
+     if (!err)
+         return { errKind: 'no-error-object' };
+     if (typeof err !== 'object')
+         return { errKind: 'primitive', err: String(err).slice(0, 200) };
+     const e = err;
+     const out = {};
+     if (typeof e.message === 'string' && e.message.length > 0)
+         out.errMessage = e.message.slice(0, 200);
+     if (typeof e.name === 'string')
+         out.errName = e.name;
+     if (e.code !== undefined)
+         out.errCode = e.code;
+     if (e.errno !== undefined)
+         out.errno = e.errno;
+     if (e.syscall !== undefined)
+         out.errSyscall = e.syscall;
+     if (e.address !== undefined)
+         out.errAddress = e.address;
+     if (e.port !== undefined)
+         out.errPort = e.port;
+     if (e.path !== undefined)
+         out.errPath = e.path;
+     // Fall back to the constructor name + JSON shape if nothing meaningful surfaced
+     if (Object.keys(out).length === 0) {
+         out.errKind = e.constructor?.name ?? 'unknown';
+         try {
+             out.errJson = JSON.stringify(e).slice(0, 300);
+         }
+         catch { /* ignore */ }
+     }
+     return out;
+ }
  /** Well-known file where the daemon writes the socket path for other processes. */
  const SOCKET_FILE_NAME = '.graph.sock';
  export class GraphStore {
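
To make the Phase 13 log payload concrete, here is what extractErrorInfo yields for a typical refused unix-socket connection (a sketch; the error fields mirror a standard Node system error, and the import path and socket path are illustrative):

import { extractErrorInfo } from './graph-store.js'; // import path assumed

const err = Object.assign(new Error(''), {
    code: 'ECONNREFUSED',
    errno: -111,
    syscall: 'connect',
    address: '/tmp/x.sock',
});
console.log(extractErrorInfo(err));
// { errName: 'Error', errCode: 'ECONNREFUSED', errno: -111,
//   errSyscall: 'connect', errAddress: '/tmp/x.sock' }
// errMessage is absent because the message is empty, which is exactly
// the case this helper exists to cover.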
@@ -31,6 +71,9 @@ export class GraphStore {
      available = false;
      persistenceDir;
      ownsServer = false;
+     livenessProbeTimer = null;
+     livenessFailureStreak = 0;
+     livenessRestartAttempts = 0;
      constructor(persistenceDir) {
          this.persistenceDir = persistenceDir;
      }
@@ -53,15 +96,21 @@ export class GraphStore {
          this.graph = this.db.selectGraph(GRAPH_NAME);
          this.available = true;
          this.ownsServer = true;
-         // Catch connection-level errors: log once, disable gracefully
+         // Catch connection-level errors with full diagnostics (Phase 13).
          let serverErrorLogged = false;
          this.db.on?.('error', (err) => {
              if (!serverErrorLogged) {
                  serverErrorLogged = true;
-                 logger.warn({ err: err.message }, 'FalkorDB server error — disabling graph features');
+                 logger.warn(extractErrorInfo(err), 'FalkorDB server error — disabling graph features');
                  this.available = false;
              }
          });
+         // Phase 13 — server-side liveness probe with auto-restart.
+         // Periodically (every 60s) ping the embedded server with a tiny
+         // query. If the ping fails, the server has hung or quietly died.
+         // Restart it by reinitializing instead of leaving graph features
+         // silently broken until the next daemon restart.
+         this.startLivenessProbe();
          // Write socket path so MCP/dashboard/assistant can connect
          writeFileSync(this.socketFilePath, this.db.socketPath, 'utf-8');
          // Create indexes for fast lookups
@@ -103,47 +152,62 @@ export class GraphStore {
          this.graph = this.client.selectGraph(GRAPH_NAME);
          this.available = true;
          this.ownsServer = false;
-         // Catch connection-level errors: disable and start reconnect loop
+         // Catch connection-level errors: disable and start reconnect loop.
+         // Phase 13: capture FULL error context (errno, code, syscall) instead
+         // of just .message — the falkordb client emits raw socket errors that
+         // often have empty .message strings, leaving us blind to the root cause.
          let errorHandled = false;
          this.client.on?.('error', (err) => {
              if (errorHandled)
                  return;
              errorHandled = true;
-             logger.warn({ err: err.message }, 'FalkorDB connection lost — starting reconnect loop');
+             const errInfo = extractErrorInfo(err);
+             logger.warn({ ...errInfo, pid: process.pid, socketPath }, 'FalkorDB connection lost — starting reconnect loop');
              this.available = false;
              try {
                  this.client?.disconnect?.();
              }
              catch { /* ignore */ }
-             // Reconnect loop: try every 30s up to 5 times, then back off to 5 min
+             // Reconnect schedule (Phase 13): exponential backoff from 1s up to
+             // 60s for the first 5 attempts (covers transient blips fast), then
+             // 5min × 5 (handles a daemon-restart window), then a slow 30min
+             // probe forever. Time to exhaust the fast retries drops from
+             // 150s (5×30s) to 104s (1+3+10+30+60), and the EARLY attempts
+             // are much more aggressive — most blips recover within seconds.
+             const SCHEDULE_MS = [
+                 1_000, 3_000, 10_000, 30_000, 60_000,
+                 5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000,
+             ];
+             const SLOW_PROBE_MS = 30 * 60_000;
              let attempts = 0;
              const reconnectLoop = async () => {
                  attempts++;
                  try {
                      const reconnected = await this.connectToRunning();
                      if (reconnected) {
-                         logger.info({ attempts }, 'FalkorDB reconnected');
-                         return; // Success — stop the loop
+                         logger.info({ attempts, pid: process.pid }, 'FalkorDB reconnected');
+                         return;
                      }
                  }
-                 catch { /* retry */ }
-                 if (attempts < 5) {
-                     setTimeout(reconnectLoop, 30_000); // Retry in 30s
+                 catch (retryErr) {
+                     // Capture retry failures too — the silent catch was hiding the root cause
+                     const ri = extractErrorInfo(retryErr);
+                     logger.debug({ ...ri, attempts }, 'FalkorDB reconnect attempt failed');
                  }
-                 else if (attempts < 10) {
-                     setTimeout(reconnectLoop, 5 * 60_000); // Back off to 5 min
-                 }
-                 else {
-                     // Keep a slow background probe instead of giving up entirely
-                     logger.warn({ attempts }, 'FalkorDB reconnect entering slow probe (every 30 min)');
-                     setTimeout(reconnectLoop, 30 * 60_000);
+                 const delay = SCHEDULE_MS[attempts] ?? SLOW_PROBE_MS; // index by `attempts`: the initial timer already consumed SCHEDULE_MS[0]
+                 if (attempts === SCHEDULE_MS.length) {
+                     logger.warn({ attempts, pid: process.pid }, 'FalkorDB reconnect exhausted fast-retry schedule — entering slow probe (every 30 min)');
                  }
+                 setTimeout(reconnectLoop, delay);
              };
-             setTimeout(reconnectLoop, 30_000);
+             setTimeout(reconnectLoop, SCHEDULE_MS[0]);
          });
          return true;
      }
-     catch {
+     catch (err) {
+         // Phase 13: log connect-time failures too (was swallowing them silently)
+         const errInfo = extractErrorInfo(err);
+         logger.debug({ ...errInfo, socketPath: this.socketFilePath }, 'FalkorDB initial connect failed');
          this.available = false;
          return false;
      }
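
The schedule arithmetic in the comment above checks out as follows (a standalone TypeScript sketch using the same constants):

const SCHEDULE_MS = [
    1_000, 3_000, 10_000, 30_000, 60_000,
    5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000, 5 * 60_000,
];
// Fast phase: 1 + 3 + 10 + 30 + 60 = 104 seconds across five attempts.
const fastPhaseSec = SCHEDULE_MS.slice(0, 5).reduce((a, b) => a + b, 0) / 1_000; // 104
// Everything before the 30-minute slow probe: 104 s + 5 × 5 min ≈ 26.7 minutes.
const fullScheduleMin = SCHEDULE_MS.reduce((a, b) => a + b, 0) / 60_000; // ≈ 26.73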
@@ -151,7 +215,101 @@ export class GraphStore {
      isAvailable() {
          return this.available;
      }
+     // ── Liveness probe (daemon only — Phase 13) ──────────────────────────
+     /**
+      * Periodic health check on the owned embedded server. Runs every 60s.
+      * Pings with a trivial Cypher query (RETURN 1). Two consecutive failures
+      * trigger a restart attempt. Gives up after 3 restart attempts to avoid
+      * crash-loop noise — at that point graph features stay disabled until
+      * the daemon is manually restarted.
+      *
+      * Daemon-only because client processes have their own reconnect loop on
+      * the .on('error') path. The probe specifically catches the case where
+      * the server hangs (no error event but stops responding).
+      */
+     startLivenessProbe() {
+         if (this.livenessProbeTimer)
+             return;
+         const PROBE_INTERVAL_MS = 60_000;
+         const MAX_RESTART_ATTEMPTS = 3;
+         const probe = async () => {
+             if (!this.ownsServer)
+                 return; // safety — only the server owner probes
+             if (!this.available || !this.graph) {
+                 // Already disabled — let the existing recovery paths run.
+                 return;
+             }
+             try {
+                 await Promise.race([
+                     this.graph.query('RETURN 1 AS ping'),
+                     new Promise((_, reject) => setTimeout(() => reject(new Error('probe-timeout')), 5_000)),
+                 ]);
+                 // Probe succeeded — reset the failure streak.
+                 if (this.livenessFailureStreak > 0) {
+                     logger.info({ priorFailures: this.livenessFailureStreak }, 'FalkorDB liveness probe recovered');
+                     this.livenessFailureStreak = 0;
+                 }
+             }
+             catch (err) {
+                 this.livenessFailureStreak++;
+                 logger.warn({ ...extractErrorInfo(err), streak: this.livenessFailureStreak }, 'FalkorDB liveness probe failed');
+                 if (this.livenessFailureStreak >= 2) {
+                     await this.attemptServerRestart(MAX_RESTART_ATTEMPTS);
+                 }
+             }
+         };
+         this.livenessProbeTimer = setInterval(probe, PROBE_INTERVAL_MS);
+         // Don't keep the daemon alive just for the probe.
+         this.livenessProbeTimer.unref?.();
+     }
+     async attemptServerRestart(maxAttempts) {
+         if (this.livenessRestartAttempts >= maxAttempts) {
+             logger.error({ attempts: this.livenessRestartAttempts }, 'FalkorDB restart attempts exhausted — graph features disabled until daemon restart');
+             this.available = false;
+             if (this.livenessProbeTimer) {
+                 clearInterval(this.livenessProbeTimer);
+                 this.livenessProbeTimer = null;
+             }
+             return;
+         }
+         this.livenessRestartAttempts++;
+         logger.warn({ attempt: this.livenessRestartAttempts, max: maxAttempts }, 'FalkorDB restart attempt');
+         try {
+             // Tear down the current server gracefully.
+             try {
+                 await this.db?.close?.();
+             }
+             catch { /* ignore */ }
+             this.db = null;
+             this.graph = null;
+             this.available = false;
+             try {
+                 unlinkSync(this.socketFilePath);
+             }
+             catch { /* ignore */ }
+             // Re-initialize. initialize() will re-register error handlers and
+             // re-start the probe — but we don't want to start a NESTED probe,
+             // so clear the timer first.
+             if (this.livenessProbeTimer) {
+                 clearInterval(this.livenessProbeTimer);
+                 this.livenessProbeTimer = null;
+             }
+             this.livenessFailureStreak = 0;
+             await this.initialize();
+             if (this.available) {
+                 logger.info({ attempt: this.livenessRestartAttempts }, 'FalkorDB restart succeeded');
+             }
+         }
+         catch (err) {
+             logger.error(extractErrorInfo(err), 'FalkorDB restart attempt failed');
+         }
+     }
      async close() {
+         // Stop the liveness probe before tearing down (Phase 13).
+         if (this.livenessProbeTimer) {
+             clearInterval(this.livenessProbeTimer);
+             this.livenessProbeTimer = null;
+         }
          if (this.ownsServer && this.db) {
              // Clean up socket file
              try {
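
One detail worth noting in the probe above: the bare Promise.race leaves the losing 5-second timer pending after a fast ping (harmless here, since Promise.race subscribes to both promises and the late rejection is never unhandled). A wrapper that also clears the timer could look like this (hypothetical helper, not part of the package):

async function withTimeout<T>(work: Promise<T>, ms: number, label = 'timeout'): Promise<T> {
    let timer!: ReturnType<typeof setTimeout>;
    const timeout = new Promise<never>((_, reject) => {
        timer = setTimeout(() => reject(new Error(label)), ms);
    });
    try {
        return await Promise.race([work, timeout]);
    }
    finally {
        clearTimeout(timer); // stop the stray timer as soon as the race settles
    }
}

// Usage, matching the probe's query:
// await withTimeout(this.graph.query('RETURN 1 AS ping'), 5_000, 'probe-timeout');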
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "clementine-agent",
-   "version": "1.1.13",
+   "version": "1.1.15",
    "description": "Clementine — Personal AI Assistant (TypeScript)",
    "type": "module",
    "main": "dist/index.js",