gitnexus 1.6.6-rc.42 → 1.6.6-rc.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -158,6 +158,7 @@ gitnexus analyze --skip-agents-md # Preserve custom AGENTS.md/CLAUDE.md gitnexu
158
158
  gitnexus analyze --verbose # Log skipped files when parsers are unavailable
159
159
  gitnexus analyze --max-file-size 1024 # Skip files larger than N KB (default: 512, cap: 32768)
160
160
  gitnexus analyze --worker-timeout 60 # Increase worker idle timeout for slow parses
161
+ gitnexus analyze --wal-checkpoint-threshold 67108864 # Set the LadybugDB WAL auto-checkpoint threshold in bytes (default: 67108864 = 64 MiB; -1 keeps Ladybug's stock ~16 MiB)
161
162
  gitnexus mcp # Start MCP server (stdio) — serves all indexed repos
162
163
  gitnexus serve # Start local HTTP server (multi-repo) for web UI
163
164
  gitnexus index # Register an existing .gitnexus/ folder into the global registry
@@ -307,6 +308,7 @@ Configure the behavior with two environment variables:
307
308
  |----------|--------|---------|--------|
308
309
  | `GITNEXUS_LBUG_EXTENSION_INSTALL` | `auto`, `load-only`, `never` | `auto` | `auto` runs one bounded INSTALL if LOAD fails. `load-only` only uses already-installed extensions (recommended for offline / firewalled environments). `never` skips optional extensions entirely. |
309
310
  | `GITNEXUS_LBUG_EXTENSION_INSTALL_TIMEOUT_MS` | positive integer | `15000` | Wall-clock budget for the out-of-process `INSTALL` child before it is killed. |
311
+ | `GITNEXUS_WAL_CHECKPOINT_THRESHOLD` | integer `>= -1` | `67108864` (64 MiB) | LadybugDB WAL auto-checkpoint threshold during analyze (bytes). Auto-checkpoint remains enabled; `-1` keeps Ladybug's stock ~16 MiB. Larger thresholds reduce checkpoint frequency but increase the WAL size at rotation time — choose a smaller value on disk-constrained environments. |
310
312
 
311
313
  ```bash
312
314
  # Offline/airgapped: never reach the network for extensions
@@ -69,6 +69,8 @@ export interface AnalyzeOptions {
69
69
  maxFileSize?: string;
70
70
  /** Override worker sub-batch idle timeout in seconds. */
71
71
  workerTimeout?: string;
72
+ /** Control LadybugDB WAL auto-checkpoint threshold during analyze. */
73
+ walCheckpointThreshold?: string;
72
74
  /** Parse worker pool size; 0 disables workers (sequential fallback). */
73
75
  workers?: string;
74
76
  embeddingThreads?: string;
@@ -12,7 +12,7 @@ import { spawn } from 'child_process';
12
12
  import v8 from 'v8';
13
13
  import cliProgress from 'cli-progress';
14
14
  import { closeLbug } from '../core/lbug/lbug-adapter.js';
15
- import { isWalCorruptionError, WAL_RECOVERY_SUGGESTION } from '../core/lbug/lbug-config.js';
15
+ import { isLbugCheckpointIoError, isWalCorruptionError, parseWalCheckpointThreshold, WAL_RECOVERY_SUGGESTION, } from '../core/lbug/lbug-config.js';
16
16
  import { getStoragePaths, getGlobalRegistryPath, RegistryNameCollisionError, AnalysisNotFinalizedError, assertAnalysisFinalized, } from '../storage/repo-manager.js';
17
17
  import { getGitRoot, hasGitDir } from '../storage/git.js';
18
18
  import { runFullAnalysis } from '../core/run-analyze.js';
@@ -322,6 +322,14 @@ const forceHeapOOMForTestIfEnabled = () => {
322
322
  for (;;)
323
323
  chunks.push('x'.repeat(1024 * 1024));
324
324
  };
325
+ // 64 MiB keeps auto-checkpoint enabled but triggers less frequently than
326
+ // Ladybug's stock ~16 MiB threshold, reducing rename/remove churn on large
327
+ // runs. Also matches the GitNexus default in `lbug-config.ts`.
328
+ //
329
+ // IMPORTANT: keep README examples (`README.md`, `gitnexus/README.md`) and
330
+ // the `DEFAULT_WAL_CHECKPOINT_THRESHOLD` constant in
331
+ // `gitnexus/src/core/lbug/lbug-config.ts` in sync with this value.
332
+ const RECOMMENDED_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024;
325
333
  /** Re-exec the process with a 16GB heap and larger stack if we're currently below that. */
326
334
  async function ensureHeap() {
327
335
  const nodeOpts = process.env.NODE_OPTIONS || '';
@@ -378,6 +386,8 @@ const ANALYZE_CLI_ENV_KEYS = [
378
386
  'GITNEXUS_PROFILE_DEFERRED_SLOW_MS',
379
387
  'GITNEXUS_MAX_FILE_SIZE',
380
388
  'GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS',
389
+ 'GITNEXUS_WAL_CHECKPOINT_THRESHOLD',
390
+ 'GITNEXUS_WAL_MANUAL_CHECKPOINT',
381
391
  'GITNEXUS_EMBEDDING_THREADS',
382
392
  'GITNEXUS_EMBEDDING_BATCH_SIZE',
383
393
  'GITNEXUS_EMBEDDING_SUB_BATCH_SIZE',
@@ -452,6 +462,15 @@ const analyzeCommandImpl = async (inputPath, options) => {
452
462
  }
453
463
  process.env.GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS = String(Math.round(workerTimeoutSeconds * 1000));
454
464
  }
465
+ if (options?.walCheckpointThreshold !== undefined) {
466
+ const parsed = parseWalCheckpointThreshold(options.walCheckpointThreshold);
467
+ if (parsed === undefined) {
468
+ cliError(' --wal-checkpoint-threshold must be an integer >= -1.\n');
469
+ process.exitCode = 1;
470
+ return;
471
+ }
472
+ process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD = String(parsed);
473
+ }
455
474
  // `--workers` is threaded through `runFullAnalysis` options → PipelineOptions
456
475
  // → createWorkerPool, intentionally bypassing the GITNEXUS_WORKER_POOL_SIZE
457
476
  // env channel so this CLI surface never mutates `process.env` for pool size.
@@ -859,6 +878,16 @@ const analyzeCommandImpl = async (inputPath, options) => {
859
878
  process.exitCode = 1;
860
879
  return;
861
880
  }
881
+ if (isLbugCheckpointIoError(err)) {
882
+ cliError(` LadybugDB failed while rotating/removing WAL checkpoint files.\n` +
883
+ ` This can happen when auto-checkpoint runs at the default threshold (~16MB).\n` +
884
+ ` Retry with a larger checkpoint threshold to reduce checkpoint frequency:\n` +
885
+ ` gitnexus analyze --wal-checkpoint-threshold ${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD}\n` +
886
+ ` (or set GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD})\n` +
887
+ ` (Try 33554432 = 32 MiB on small-disk / CI runners.)\n`, { recoveryHint: 'wal-checkpoint-threshold' });
888
+ process.exitCode = 1;
889
+ return;
890
+ }
862
891
  // HF download failure — show clean guidance without the raw stack trace.
863
892
  // Checked before writeFatalToStderr so the user sees one focused message
864
893
  // rather than a stack-trace dump followed by a second remediation block.
@@ -1,15 +1,37 @@
1
+ /**
2
+ * String-literal union of all `recoveryHint` tags emitted by the CLI.
3
+ *
4
+ * Centralized so a new recovery branch added in `analyze.ts` cannot land
5
+ * without updating this union — TypeScript will reject the unknown literal
6
+ * passed via `cliError({ recoveryHint: '...' })`. To add a new hint:
7
+ * 1. Add the tag string to this union.
8
+ * 2. Pass it as the `recoveryHint` field at the relevant `cliError`
9
+ * call site.
10
+ *
11
+ * Consumers can import this type to narrow log-record `recoveryHint`
12
+ * fields without restating the literal list.
13
+ */
14
+ export type RecoveryHint = 'wal-corruption' | 'wal-checkpoint-threshold' | 'heap-oom-respawn' | 'native-worker-abort' | 'hf-endpoint-unreachable' | 'large-repo' | 'npm-resolution' | 'module-not-found';
15
+ /**
16
+ * Common shape for the optional structured-field bag passed to
17
+ * `cliError`/`cliWarn`/`cliInfo`. Typed so the `recoveryHint` slot is
18
+ * checked against the {@link RecoveryHint} union.
19
+ */
20
+ export interface CliMessageFields extends Record<string, unknown> {
21
+ recoveryHint?: RecoveryHint;
22
+ }
1
23
  /**
2
24
  * User-facing informational message. Use for banners, listening URLs,
3
25
  * and any message the user expects to read in plain text.
4
26
  */
5
- export declare function cliInfo(msg: string, fields?: Record<string, unknown>): void;
27
+ export declare function cliInfo(msg: string, fields?: CliMessageFields): void;
6
28
  /**
7
29
  * User-facing warning. Operator-actionable but non-fatal — `cliWarn`
8
30
  * indicates the command can still proceed in some form.
9
31
  */
10
- export declare function cliWarn(msg: string, fields?: Record<string, unknown>): void;
32
+ export declare function cliWarn(msg: string, fields?: CliMessageFields): void;
11
33
  /**
12
34
  * User-facing error. Indicates the command cannot proceed; usually
13
35
  * paired with a non-zero exit code at the call site.
14
36
  */
15
- export declare function cliError(msg: string, fields?: Record<string, unknown>): void;
37
+ export declare function cliError(msg: string, fields?: CliMessageFields): void;
package/dist/cli/index.js CHANGED
@@ -38,6 +38,8 @@ program
38
38
  .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
39
39
  .option('--max-file-size <kb>', 'Skip files larger than this (KB). Default: 512. Hard cap: 32768 (tree-sitter limit).')
40
40
  .option('--worker-timeout <seconds>', 'Worker sub-batch idle timeout before retry/fallback. Default: 30.')
41
+ .option('--wal-checkpoint-threshold <bytes>', 'LadybugDB WAL auto-checkpoint threshold in bytes during analyze ' +
42
+ '(integer >= -1; default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).')
41
43
  .option('--workers <n>', 'Parse worker pool size. Default: cores-1 capped at 16. Pass 0 to disable workers (sequential).')
42
44
  .option('--embedding-threads <n>', 'Limit local ONNX embedding CPU threads')
43
45
  .option('--embedding-batch-size <n>', 'Number of nodes per embedding batch')
@@ -47,6 +49,7 @@ program
47
49
  ' GITNEXUS_NO_GITIGNORE=1 Skip .gitignore parsing (still reads .gitnexusignore)\n' +
48
50
  ' GITNEXUS_MAX_FILE_SIZE=N Override large-file skip threshold (KB). Default 512, max 32768.\n' +
49
51
  ' GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS=N Worker idle timeout in milliseconds. Default 30000.\n' +
52
+ ' GITNEXUS_WAL_CHECKPOINT_THRESHOLD=N LadybugDB WAL auto-checkpoint threshold in bytes (default 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).\n' +
50
53
  ' GITNEXUS_WORKER_SUB_BATCH_MAX_BYTES=N Worker job byte budget. Default 8388608.\n' +
51
54
  ' GITNEXUS_WORKER_POOL_SIZE=N Parse worker count override. Default cores-1 capped at 16.\n' +
52
55
  ' GITNEXUS_PARSE_CHUNK_CONCURRENCY=N Concurrent in-flight parse chunks. Default 2.\n' +
@@ -55,6 +58,7 @@ program
55
58
  ' GITNEXUS_WORKER_CONSECUTIVE_FAILURE_THRESHOLD=N Per-slot deaths to trip circuit breaker. Default max(3, poolSize).\n' +
56
59
  ' GITNEXUS_EMBEDDING_THREADS=N Limit local ONNX CPU threads for --embeddings.\n' +
57
60
  ' GITNEXUS_SEMANTIC_EXACT_SCAN_LIMIT=N Max embedding chunks for exact-scan fallback. Default 10000.\n' +
61
+ '\nFlags override the corresponding env vars when both are provided.\n' +
58
62
  '\nTip: `.gitnexusignore` supports `.gitignore`-style negation. Add e.g.\n' +
59
63
  ' `!__tests__/` to index a directory that is auto-filtered by default (#771).')
60
64
  .action(createLazyAction(() => import('./analyze.js'), 'analyzeCommand'));
@@ -142,6 +142,21 @@ export declare const fetchExistingEmbeddingHashes: (execQuery: (cypher: string)
142
142
  * @see safeClose — CHECKPOINT + connection/database close
143
143
  */
144
144
  export declare const flushWAL: () => Promise<void>;
145
+ /**
146
+ * Issue a manual `CHECKPOINT` against the current connection and surface
147
+ * any engine error to the caller. Unlike {@link flushWAL}, this variant
148
+ * does NOT swallow Ladybug rename/remove IO failures — the manual
149
+ * checkpoint driver (`wal-checkpoint-driver.ts`) relies on the rejection
150
+ * to drive its bounded retry loop. Returns `false` when no connection is
151
+ * open (the caller treats this as a no-op success — there is no WAL to
152
+ * flush). Returns `true` after a successful CHECKPOINT + drain.
153
+ *
154
+ * The split from `flushWAL` is deliberate: every other CHECKPOINT site
155
+ * (server flush, safeClose) is best-effort and prefers a silent skip;
156
+ * the manual driver, by contrast, must observe failures to decide
157
+ * whether to retry.
158
+ */
159
+ export declare const tryFlushWAL: () => Promise<boolean>;
145
160
  /**
146
161
  * Flush the WAL and close the connection and database handles.
147
162
  *
@@ -1334,6 +1334,27 @@ export const flushWAL = async () => {
1334
1334
  logger.debug(`GitNexus: LadybugDB CHECKPOINT skipped/failed during WAL flush: ${summarizeError(err)}`);
1335
1335
  }
1336
1336
  };
1337
/**
 * Run a manual `CHECKPOINT` on the open connection and let any engine error
 * reach the caller.
 *
 * This is the failure-observing counterpart of {@link flushWAL}: nothing is
 * caught here, so a rejected CHECKPOINT query (or a failure while draining
 * its result) rejects the returned promise. The manual checkpoint driver
 * (`wal-checkpoint-driver.ts`) depends on that rejection to run its bounded
 * retry loop, whereas the other CHECKPOINT sites (server flush, safeClose)
 * are best-effort and prefer a silent skip.
 *
 * @returns {Promise<boolean>} `false` when no connection is open (nothing is
 *   attempted; the caller treats this as a no-op success), `true` once the
 *   CHECKPOINT query has completed and its result has been drained.
 */
export const tryFlushWAL = async () => {
    if (conn) {
        const result = await conn.query('CHECKPOINT');
        await drainQueryResult(result);
        return true;
    }
    return false;
};
1337
1358
  /**
1338
1359
  * Flush the WAL and close the connection and database handles.
1339
1360
  *
@@ -32,8 +32,15 @@ import type lbug from '@ladybugdb/core';
32
32
  * integer; anything invalid falls back to the default.
33
33
  */
34
34
  export declare const LBUG_MAX_DB_SIZE: number;
35
+ export declare const parseWalCheckpointThreshold: (raw: string | undefined) => number | undefined;
35
36
  export declare const WAL_RECOVERY_SUGGESTION = "WAL corruption detected. Run `gitnexus analyze --force` to rebuild the index.";
36
37
  export declare function isWalCorruptionError(err: unknown): boolean;
38
+ /**
39
+ * True when `err` looks like a Ladybug WAL-checkpoint rotation/remove IO
40
+ * failure. Tries strict matchers first (renames + removes), then falls
41
+ * back to the permissive matcher.
42
+ */
43
+ export declare const isLbugCheckpointIoError: (err: unknown) => boolean;
37
44
  type LbugModule = typeof lbug;
38
45
  export interface LbugDatabaseOptions {
39
46
  readOnly?: boolean;
@@ -1,6 +1,7 @@
1
1
  import fs from 'fs/promises';
2
2
  import os from 'os';
3
3
  import path from 'path';
4
+ import { logger } from '../logger.js';
4
5
  /**
5
6
  * Shared configuration for `@ladybugdb/core` `Database` construction.
6
7
  *
@@ -42,6 +43,43 @@ export const LBUG_MAX_DB_SIZE = (() => {
42
43
  }
43
44
  return 16 * 1024 * 1024 * 1024;
44
45
  })();
46
/**
 * Parse a WAL auto-checkpoint threshold supplied as text (CLI flag or
 * environment variable).
 *
 * @param {string | undefined} raw - Raw value; surrounding whitespace is
 *   ignored.
 * @returns {number | undefined} The threshold as an integer `>= -1`, or
 *   `undefined` when `raw` is missing, blank, not an integer, below `-1`,
 *   or too large to be represented exactly as a JavaScript number.
 */
export const parseWalCheckpointThreshold = (raw) => {
    if (raw === undefined)
        return undefined;
    const normalized = raw.trim();
    if (normalized.length === 0)
        return undefined;
    const parsed = Number(normalized);
    // `isSafeInteger` rather than `isInteger`: inputs such as "1e30" are
    // integer-valued doubles but cannot be represented exactly, so handing
    // them on as a byte count would pass a meaningless value downstream.
    if (!Number.isSafeInteger(parsed) || parsed < -1)
        return undefined;
    return parsed;
};
57
+ /**
58
+ * Default GitNexus WAL auto-checkpoint threshold in bytes (64 MiB).
59
+ *
60
+ * Larger than Ladybug's stock ~16 MiB to reduce checkpoint rename/remove
61
+ * churn under heavy analyze write load — the original race that motivated
62
+ * issue #1741 triggered at the stock threshold. README examples in
63
+ * `README.md` and `gitnexus/README.md` and the recovery hint in
64
+ * `analyze.ts` MUST stay in sync with this value.
65
+ */
66
+ const DEFAULT_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024;
67
/**
 * Resolve the checkpoint threshold from `GITNEXUS_WAL_CHECKPOINT_THRESHOLD`.
 *
 * A parseable value wins. An unset, blank, or invalid value yields
 * `DEFAULT_WAL_CHECKPOINT_THRESHOLD`; only a non-blank invalid value is
 * reported, and only as a warning — the env-var path stays soft (unlike the
 * CLI flag, which hard-errors) to preserve "set once in your shell"
 * ergonomics across mixed-version invocations.
 */
const resolveCheckpointThreshold = () => {
    const envValue = process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD;
    const threshold = envValue === undefined ? undefined : parseWalCheckpointThreshold(envValue);
    if (threshold !== undefined) {
        return threshold;
    }
    const isNonBlank = envValue !== undefined && envValue.trim().length > 0;
    if (isNonBlank) {
        logger.warn({ rawValue: envValue, fallback: DEFAULT_WAL_CHECKPOINT_THRESHOLD }, `Ignoring invalid GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${envValue}; expected integer >= -1; falling back to default (${DEFAULT_WAL_CHECKPOINT_THRESHOLD}).`);
    }
    return DEFAULT_WAL_CHECKPOINT_THRESHOLD;
};
45
83
  /** Matches WAL corruption errors from the LadybugDB engine. */
46
84
  const WAL_CORRUPTION_RE = /corrupt(ed)?\s+wal|invalid\s+wal\s+record|wal.*corrupt|checksum.*wal/i;
47
85
  export const WAL_RECOVERY_SUGGESTION = 'WAL corruption detected. Run `gitnexus analyze --force` to rebuild the index.';
@@ -51,6 +89,48 @@ export function isWalCorruptionError(err) {
51
89
  const msg = err instanceof Error ? err.message : String(err);
52
90
  return WAL_CORRUPTION_RE.test(msg);
53
91
  }
92
+ // ─── Ladybug WAL checkpoint IO error matchers ───────────────────────────────
93
+ //
94
+ // Matched against LadybugDB v0.16.1 (see `gitnexus/package.json`
95
+ // @ladybugdb/core). Strict regexes encode local_file_system.cpp wording
96
+ // verified at that version. Two-tier strategy: strict matchers first so we
97
+ // only fire on real checkpoint-rotation shapes; a permissive fallback
98
+ // catches future Ladybug message drift so the recovery hint keeps surfacing
99
+ // even if upstream wording changes.
100
+ //
101
+ // From Ladybug native LocalFileSystem exceptions (`local_file_system.cpp`),
102
+ // surfaced in Node as:
103
+ // "Runtime exception: IO exception: Error renaming file ..."
104
+ // "Runtime exception: IO exception: Error removing directory or file ..."
105
+ // We only match checkpoint-rotation shapes:
106
+ // - "<db>.wal -> <db>.wal.checkpoint" rename failures
107
+ // - "<db>.wal.checkpoint" remove failures
108
+ // Example matches:
109
+ // "Runtime exception: IO exception: Error renaming file /x/lbug.wal to /x/lbug.wal.checkpoint. ErrorMessage: Permission denied"
110
+ // "Runtime exception: IO exception: Error removing directory or file /x/lbug.wal.checkpoint. Error Message: Permission denied"
111
+ // Matching is case-insensitive to remain robust across wrappers/platforms.
112
+ const LBUG_CHECKPOINT_RENAME_RE = /^runtime exception: io exception:\s*error renaming file\s+.+?\.wal\s+to\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i;
113
+ const LBUG_CHECKPOINT_REMOVE_RE = /^runtime exception: io exception:\s*error removing directory or file\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i;
114
+ /**
115
+ * Permissive fallback: any IO-exception-shaped message that mentions a
116
+ * `.wal.checkpoint` path. Catches future Ladybug message drift (different
117
+ * verb, additional preamble, locale variation) so the recovery hint keeps
118
+ * surfacing even if the strict regexes go stale.
119
+ */
120
+ const LBUG_CHECKPOINT_PERMISSIVE_RE = /io exception.*\.wal\.checkpoint/i;
121
+ /**
122
+ * True when `err` looks like a Ladybug WAL-checkpoint rotation/remove IO
123
+ * failure. Tries strict matchers first (renames + removes), then falls
124
+ * back to the permissive matcher.
125
+ */
126
+ export const isLbugCheckpointIoError = (err) => {
127
+ if (!err)
128
+ return false;
129
+ const msg = err instanceof Error ? err.message : String(err);
130
+ if (LBUG_CHECKPOINT_RENAME_RE.test(msg) || LBUG_CHECKPOINT_REMOVE_RE.test(msg))
131
+ return true;
132
+ return LBUG_CHECKPOINT_PERMISSIVE_RE.test(msg);
133
+ };
54
134
  /**
55
135
  * Return true when the error message indicates that a LadybugDB file lock
56
136
  * could not be acquired — either at construction time
@@ -76,8 +156,8 @@ export function createLbugDatabase(lbugModule, databasePath, options = {}) {
76
156
  // .d.ts declares fewer args than the native constructor accepts.
77
157
  return new lbugModule.Database(databasePath, 0, // bufferManagerSize
78
158
  false, // enableCompression (pinned for v0.16.0)
79
- options.readOnly ?? false, LBUG_MAX_DB_SIZE, true, // autoCheckpoint
80
- -1, // checkpointThreshold
159
+ options.readOnly ?? false, LBUG_MAX_DB_SIZE, true, // autoCheckpoint (always on)
160
+ resolveCheckpointThreshold(), // checkpointThreshold (default 64 MiB; override with GITNEXUS_WAL_CHECKPOINT_THRESHOLD; -1 keeps Ladybug stock ~16 MiB)
81
161
  options.throwOnWalReplayFailure ?? true, true);
82
162
  }
83
163
  // ─── Lock-busy retry tuning knobs ───────────────────────────────────────────
@@ -0,0 +1,98 @@
1
+ /**
2
+ * Manual WAL checkpoint driver with bounded retry (#1741 follow-up).
3
+ *
4
+ * Background
5
+ * ----------
6
+ * LadybugDB's native auto-checkpoint runs from inside the C++ engine on a
7
+ * background path that has no JS-side hook for mid-write rotation. When
8
+ * the rename of `<db>.wal` → `<db>.wal.checkpoint` races a transient file
9
+ * lock (Windows Defender, AV scanner, NTFS shadow copy) the engine raises
10
+ * a `Runtime exception: IO exception: Error renaming file …` that aborts
11
+ * the in-flight write. There is no engine-level retry.
12
+ *
13
+ * The auto-checkpoint cannot be made retryable from JS, but a *manual*
14
+ * `CHECKPOINT` query that the JS layer issues itself CAN be wrapped in a
15
+ * bounded retry. By draining the WAL on a tight cadence — more often than
16
+ * the native threshold — the auto-checkpoint almost never has work left
17
+ * to do, so the un-retriable native rename race is moved into the
18
+ * JS-controlled path where this module's retry absorbs it.
19
+ *
20
+ * Design contract
21
+ * ---------------
22
+ * - `autoCheckpoint` stays on (maintainer requirement). This driver is
23
+ * additive: it preempts the native checkpoint, it does not replace it.
24
+ * - The driver runs ONLY during analyze (callers opt-in explicitly). MCP
25
+ * and other long-lived flows continue to rely on the close-time
26
+ * CHECKPOINT in `safeClose`.
27
+ * - Opt-out is via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`. Default is on.
28
+ * - Retries only fire on `isLbugCheckpointIoError` — every other error
29
+ * surfaces immediately. The retry budget is small (3 attempts) with
30
+ * jittered backoff so a chronic rename failure escalates fast.
31
+ * - Retry attempts log at `debug`; only the final, exhausted failure
32
+ * surfaces to the caller (and is logged at `warn` here for operators).
33
+ */
34
+ /**
35
+ * Run a single CHECKPOINT with bounded retry on
36
+ * `isLbugCheckpointIoError`. Resolves with `{ attempts, flushed }`, where `attempts` is the number of attempts actually
37
+ * spent (1-`CHECKPOINT_RETRY_ATTEMPTS`) on success, or rethrows the last
38
+ * checkpoint error after exhausting the budget. Non-checkpoint errors
39
+ * (e.g. WAL corruption, lock-busy) propagate immediately on the first
40
+ * attempt — those are not what this retry is designed to absorb.
41
+ *
42
+ * The split from `flushWAL` is deliberate: `flushWAL` is the swallow-and-
43
+ * log helper used by `safeClose` and the server's best-effort flush,
44
+ * which by contract cannot fail the surrounding operation. The manual
45
+ * driver MUST observe failures to decide whether to retry, and that is
46
+ * the role of `tryFlushWAL`.
47
+ *
48
+ * Exported for direct unit testing — production callers use
49
+ * {@link startWalCheckpointDriver} or {@link checkpointOnce}.
50
+ */
51
+ export declare const runCheckpointWithRetry: (options?: {
52
+ /** Override the sleep implementation for tests. */
53
+ sleepFn?: (ms: number) => Promise<void>;
54
+ /** Override the CHECKPOINT call for tests. */
55
+ checkpointFn?: () => Promise<boolean>;
56
+ /** Override the jitter source for tests. Returns a value in [0, 1). */
57
+ randomFn?: () => number;
58
+ }) => Promise<{
59
+ attempts: number;
60
+ flushed: boolean;
61
+ }>;
62
+ /**
63
+ * Single-shot manual checkpoint. Use this when the caller drives the
64
+ * cadence itself (e.g. a phase boundary in `runFullAnalysis`).
65
+ *
66
+ * Honors the `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` opt-out so operators can
67
+ * disable the manual path if it ever interacts badly with a future
68
+ * Ladybug release.
69
+ */
70
+ export declare const checkpointOnce: () => Promise<void>;
71
+ /**
72
+ * Start a periodic manual checkpoint driver. The returned handle has a
73
+ * `stop()` method that resolves once the in-flight checkpoint (if any)
74
+ * settles, so callers can `await driver.stop()` before close-time
75
+ * `safeClose` and avoid racing the final flush.
76
+ *
77
+ * The first checkpoint fires after `periodMs` (not immediately) so a
78
+ * cold analyze does not pay a CHECKPOINT round trip before any writes
79
+ * have happened.
80
+ */
81
+ export interface WalCheckpointDriver {
82
+ /** Stop the driver and await any in-flight checkpoint. Idempotent. */
83
+ stop(): Promise<void>;
84
+ }
85
+ export declare const startWalCheckpointDriver: (options?: {
86
+ periodMs?: number;
87
+ }) => WalCheckpointDriver;
88
+ /**
89
+ * Reading `GITNEXUS_WAL_MANUAL_CHECKPOINT` at every call site (rather
90
+ * than caching at module load) keeps `analyzeCommand` env restoration
91
+ * honest: tests that toggle the flag between invocations see the live
92
+ * value, matching the `ANALYZE_CLI_ENV_KEYS` snapshot/restore contract
93
+ * in `analyze.ts`.
94
+ *
95
+ * Accepted opt-out values: '0', 'false', 'off', 'no' (case-insensitive).
96
+ * Anything else — including undefined — leaves the driver enabled.
97
+ */
98
+ export declare const isManualCheckpointEnabled: () => boolean;
@@ -0,0 +1,189 @@
1
+ /**
2
+ * Manual WAL checkpoint driver with bounded retry (#1741 follow-up).
3
+ *
4
+ * Background
5
+ * ----------
6
+ * LadybugDB's native auto-checkpoint runs from inside the C++ engine on a
7
+ * background path that has no JS-side hook for mid-write rotation. When
8
+ * the rename of `<db>.wal` → `<db>.wal.checkpoint` races a transient file
9
+ * lock (Windows Defender, AV scanner, NTFS shadow copy) the engine raises
10
+ * a `Runtime exception: IO exception: Error renaming file …` that aborts
11
+ * the in-flight write. There is no engine-level retry.
12
+ *
13
+ * The auto-checkpoint cannot be made retryable from JS, but a *manual*
14
+ * `CHECKPOINT` query that the JS layer issues itself CAN be wrapped in a
15
+ * bounded retry. By draining the WAL on a tight cadence — more often than
16
+ * the native threshold — the auto-checkpoint almost never has work left
17
+ * to do, so the un-retriable native rename race is moved into the
18
+ * JS-controlled path where this module's retry absorbs it.
19
+ *
20
+ * Design contract
21
+ * ---------------
22
+ * - `autoCheckpoint` stays on (maintainer requirement). This driver is
23
+ * additive: it preempts the native checkpoint, it does not replace it.
24
+ * - The driver runs ONLY during analyze (callers opt-in explicitly). MCP
25
+ * and other long-lived flows continue to rely on the close-time
26
+ * CHECKPOINT in `safeClose`.
27
+ * - Opt-out is via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`. Default is on.
28
+ * - Retries only fire on `isLbugCheckpointIoError` — every other error
29
+ * surfaces immediately. The retry budget is small (3 attempts) with
30
+ * jittered backoff so a chronic rename failure escalates fast.
31
+ * - Retry attempts log at `debug`; only the final, exhausted failure
32
+ * surfaces to the caller (and is logged at `warn` here for operators).
33
+ */
34
+ import { logger } from '../logger.js';
35
+ import { tryFlushWAL } from './lbug-adapter.js';
36
+ import { isLbugCheckpointIoError } from './lbug-config.js';
37
/**
 * Bounded retry budget: the maximum number of CHECKPOINT attempts per call.
 * With a budget of 3 a fully failing call performs three CHECKPOINT round
 * trips separated by two back-off sleeps (no sleep follows the final
 * attempt).
 */
const CHECKPOINT_RETRY_ATTEMPTS = 3;
/**
 * Base back-off in ms, indexed by `attempt - 1`: the wait inserted after a
 * failed attempt before the next one. With the current budget only the
 * first two entries are ever used; the last one is reached only if the
 * budget is raised.
 */
const BASE_DELAYS_MS = [50, 200, 500];
/** Upper bound (exclusive) of the jitter added on top of each base delay. */
const JITTER_MAX_MS = 50;
/** Promise-based `setTimeout`; the default sleep implementation. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
/**
 * Run one CHECKPOINT, retrying only on `isLbugCheckpointIoError`.
 *
 * Any other error is rethrown from the attempt that raised it, with no
 * retry. When every attempt fails with a checkpoint IO error, a warning is
 * logged and the last such error is rethrown.
 *
 * Exported for direct unit testing — production callers use
 * {@link startWalCheckpointDriver} or {@link checkpointOnce}.
 *
 * @param {object} [options]
 * @param {(ms: number) => Promise<void>} [options.sleepFn] - Sleep override
 *   for tests.
 * @param {() => Promise<boolean>} [options.checkpointFn] - CHECKPOINT
 *   override for tests; defaults to `tryFlushWAL`.
 * @param {() => number} [options.randomFn] - Jitter source override for
 *   tests; defaults to `Math.random`.
 * @returns {Promise<{attempts: number, flushed: boolean}>} `attempts` is the
 *   1-based attempt that succeeded; `flushed` is whatever the checkpoint
 *   function resolved with.
 */
export const runCheckpointWithRetry = async (options = {}) => {
    const pause = options.sleepFn ?? sleep;
    const doCheckpoint = options.checkpointFn ?? tryFlushWAL;
    const nextRandom = options.randomFn ?? Math.random;
    let attempt = 0;
    let failure;
    while (attempt < CHECKPOINT_RETRY_ATTEMPTS) {
        attempt += 1;
        try {
            return { attempts: attempt, flushed: await doCheckpoint() };
        }
        catch (err) {
            // Only checkpoint rename/remove IO errors are retried; retrying
            // anything else would mask the real signal.
            if (!isLbugCheckpointIoError(err)) {
                throw err;
            }
            failure = err;
        }
        if (attempt < CHECKPOINT_RETRY_ATTEMPTS) {
            const slot = Math.min(attempt - 1, BASE_DELAYS_MS.length - 1);
            const baseDelay = BASE_DELAYS_MS[slot] ?? 500;
            // Non-cryptographic jitter is intentional: it only de-synchronizes
            // retries between concurrent analyzers.
            const delayMs = baseDelay + Math.floor(nextRandom() * JITTER_MAX_MS);
            logger.debug({ attempt, totalAttempts: CHECKPOINT_RETRY_ATTEMPTS, delayMs }, 'GitNexus: WAL checkpoint IO error — retrying');
            await pause(delayMs);
        }
    }
    logger.warn({ attempts: CHECKPOINT_RETRY_ATTEMPTS }, 'GitNexus: manual WAL checkpoint exhausted retry budget — surfacing IO error to caller');
    throw failure;
};
101
/**
 * Perform one manual checkpoint (with the bounded retry of
 * `runCheckpointWithRetry`) for callers that drive the cadence themselves,
 * e.g. at a phase boundary in `runFullAnalysis`.
 *
 * Does nothing when the manual path is opted out via
 * `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`, so operators can disable it if it ever
 * interacts badly with a future Ladybug release.
 *
 * @returns {Promise<void>} Resolves when the checkpoint attempt has finished
 *   or was skipped; rejects with whatever `runCheckpointWithRetry` throws.
 */
export const checkpointOnce = async () => {
    if (isManualCheckpointEnabled()) {
        await runCheckpointWithRetry();
    }
};
114
/** Default cadence (ms) for the periodic driver. */
const DEFAULT_PERIOD_MS = 5_000;
/**
 * Start a periodic manual checkpoint driver.
 *
 * Every `periodMs` the driver runs `runCheckpointWithRetry()`, unless a
 * previous run is still in flight — runs never overlap. The first run fires
 * after `periodMs`, not immediately. A run that fails after its retries is
 * logged at `warn` and otherwise swallowed; the driver never throws into the
 * timer callback.
 *
 * When the manual path is disabled (`isManualCheckpointEnabled()` is false)
 * no timer is created and a no-op handle is returned.
 *
 * @param {object} [options]
 * @param {number} [options.periodMs] - Interval between checkpoint runs in
 *   ms. Defaults to `DEFAULT_PERIOD_MS`.
 * @returns {{ stop(): Promise<void> }} Handle whose `stop()` cancels the
 *   timer and resolves once any in-flight checkpoint has settled.
 *   Idempotent.
 */
export const startWalCheckpointDriver = (options = {}) => {
    if (!isManualCheckpointEnabled()) {
        return { stop: async () => undefined };
    }
    const periodMs = options.periodMs ?? DEFAULT_PERIOD_MS;
    let stopped = false;
    let inflight = null;
    const tick = () => {
        // Skip this tick while a previous checkpoint is still running:
        // starting another would issue concurrent CHECKPOINTs and lose track
        // of the run that `stop()` must wait for.
        if (stopped || inflight !== null)
            return;
        const run = runCheckpointWithRetry()
            .then(() => undefined)
            .catch((err) => {
            // The retry budget exhausted. The caller's surrounding write
            // will see the same engine error on its next operation and the
            // `analyzeCommand` catch block will emit the recovery hint.
            // Logging here keeps the operator-visible trail without
            // double-logging the user-facing message.
            logger.warn({ err: err instanceof Error ? err.message : String(err) }, 'GitNexus: manual WAL checkpoint failed after retries');
        });
        inflight = run;
        // Release the slot only if it still belongs to this run.
        const release = () => {
            if (inflight === run)
                inflight = null;
        };
        run.then(release, release);
    };
    const handle = setInterval(tick, periodMs);
    // Node's `setInterval` returns a `Timeout` with `.unref()`; unref it so
    // a pending driver tick never keeps the process alive.
    if (typeof handle.unref === 'function') {
        handle.unref();
    }
    return {
        stop: async () => {
            // Safe to repeat: clearing an already-cleared interval is a no-op.
            stopped = true;
            clearInterval(handle);
            if (inflight) {
                try {
                    await inflight;
                }
                catch {
                    /* failure already logged in tick() — surface path is the surrounding write */
                }
            }
        },
    };
};
173
/**
 * Report whether the manual checkpoint path is enabled.
 *
 * `GITNEXUS_WAL_MANUAL_CHECKPOINT` is read on every call rather than cached
 * at module load, so a caller that toggles the variable between invocations
 * (e.g. tests relying on the `ANALYZE_CLI_ENV_KEYS` snapshot/restore
 * contract in `analyze.ts`) always sees the live value.
 *
 * @returns {boolean} `false` only when the variable, trimmed and
 *   lower-cased, is one of '0', 'false', 'off', 'no'. Anything else —
 *   including an unset variable — leaves the path enabled.
 */
export const isManualCheckpointEnabled = () => {
    const flag = process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT;
    if (flag === undefined) {
        return true;
    }
    switch (flag.trim().toLowerCase()) {
        case '0':
        case 'false':
        case 'off':
        case 'no':
            return false;
        default:
            return true;
    }
};
@@ -14,6 +14,7 @@ import { execFileSync } from 'child_process';
14
14
  import { runPipelineFromRepo } from './ingestion/pipeline.js';
15
15
  import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReusedStatement, closeLbug, loadCachedEmbeddings, deleteNodesForFile, deleteAllCommunitiesAndProcesses, queryImporters, } from './lbug/lbug-adapter.js';
16
16
  import { createSearchFTSIndexes, verifySearchFTSIndexes } from './search/fts-indexes.js';
17
+ import { startWalCheckpointDriver, } from './lbug/wal-checkpoint-driver.js';
17
18
  import { getStoragePaths, saveMeta, loadMeta, ensureGitNexusIgnored, registerRepo, cleanupOldKuzuFiles, INCREMENTAL_SCHEMA_VERSION, } from '../storage/repo-manager.js';
18
19
  import { computeFileHashes, diffFileHashes } from '../storage/file-hash.js';
19
20
  import { extractChangedSubgraph, computeEffectiveWriteSet, } from './incremental/subgraph-extract.js';
@@ -343,6 +344,15 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
343
344
  }
344
345
  }
345
346
  await initLbug(lbugPath);
347
+ // Manual WAL checkpoint driver (#1741): periodically drain the WAL
348
+ // from JS so the un-retriable native auto-checkpoint almost never
349
+ // has work left to do. Failures of the manual CHECKPOINT are absorbed
350
+ // by the driver's bounded retry; the final un-recoverable error still
351
+ // surfaces via the surrounding write that follows the failed flush.
352
+ // Opt-out via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` (the driver itself
353
+ // returns a no-op handle when disabled). Analyze-only: MCP and serve
354
+ // paths continue to rely on the close-time CHECKPOINT in `safeClose`.
355
+ const walCheckpointDriver = startWalCheckpointDriver();
346
356
  try {
347
357
  // All work after initLbug is wrapped in try/finally to ensure closeLbug()
348
358
  // is called even if an error occurs — the module-level singleton DB handle
@@ -726,6 +736,9 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
726
736
  // Best-effort — don't fail the entire analysis for context file issues
727
737
  }
728
738
  // ── Close LadybugDB ──────────────────────────────────────────────
739
+ // Stop the manual checkpoint driver before closeLbug so its
740
+ // in-flight CHECKPOINT cannot race the `safeClose` CHECKPOINT.
741
+ await walCheckpointDriver.stop();
729
742
  await closeLbug();
730
743
  progress('done', 100, 'Done');
731
744
  return {
@@ -736,7 +749,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
736
749
  };
737
750
  }
738
751
  catch (err) {
739
- // Ensure LadybugDB is closed even on error
752
+ // Ensure LadybugDB is closed even on error. Stop the driver first
753
+ // so its retry loop cannot extend an already-failing analyze.
754
+ try {
755
+ await walCheckpointDriver.stop();
756
+ }
757
+ catch {
758
+ /* swallow — surface path is the rethrow below */
759
+ }
740
760
  try {
741
761
  await closeLbug();
742
762
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitnexus",
3
- "version": "1.6.6-rc.42",
3
+ "version": "1.6.6-rc.43",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",