npm - gitnexus - Versions diffs - 1.6.6-rc.42 → 1.6.6-rc.43 - Mend

gitnexus 1.6.6-rc.42 → 1.6.6-rc.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +2 -0
package/dist/cli/analyze.d.ts +2 -0
package/dist/cli/analyze.js +30 -1
package/dist/cli/cli-message.d.ts +25 -3
package/dist/cli/index.js +4 -0
package/dist/core/lbug/lbug-adapter.d.ts +15 -0
package/dist/core/lbug/lbug-adapter.js +21 -0
package/dist/core/lbug/lbug-config.d.ts +7 -0
package/dist/core/lbug/lbug-config.js +82 -2
package/dist/core/lbug/wal-checkpoint-driver.d.ts +98 -0
package/dist/core/lbug/wal-checkpoint-driver.js +189 -0
package/dist/core/run-analyze.js +21 -1
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -158,6 +158,7 @@ gitnexus analyze --skip-agents-md  # Preserve custom AGENTS.md/CLAUDE.md gitnexu
 gitnexus analyze --verbose       # Log skipped files when parsers are unavailable
 gitnexus analyze --max-file-size 1024  # Skip files larger than N KB (default: 512, cap: 32768)
 gitnexus analyze --worker-timeout 60  # Increase worker idle timeout for slow parses
+gitnexus analyze --wal-checkpoint-threshold 67108864  # Set LadybugDB WAL auto-checkpoint threshold in bytes (default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB)
 gitnexus mcp                     # Start MCP server (stdio) — serves all indexed repos
 gitnexus serve                   # Start local HTTP server (multi-repo) for web UI
 gitnexus index                   # Register an existing .gitnexus/ folder into the global registry
@@ -307,6 +308,7 @@ Configure the behavior with two environment variables:
 |----------|--------|---------|--------|
 | `GITNEXUS_LBUG_EXTENSION_INSTALL` | `auto`, `load-only`, `never` | `auto` | `auto` runs one bounded INSTALL if LOAD fails. `load-only` only uses already-installed extensions (recommended for offline / firewalled environments). `never` skips optional extensions entirely. |
 | `GITNEXUS_LBUG_EXTENSION_INSTALL_TIMEOUT_MS` | positive integer | `15000` | Wall-clock budget for the out-of-process `INSTALL` child before it is killed. |
+| `GITNEXUS_WAL_CHECKPOINT_THRESHOLD` | integer `>= -1` | `67108864` (64 MiB) | LadybugDB WAL auto-checkpoint threshold during analyze (bytes). Auto-checkpoint remains enabled; `-1` keeps Ladybug's stock ~16 MiB. Larger thresholds reduce checkpoint frequency but increase the WAL size at rotation time — choose a smaller value on disk-constrained environments. |
 ```bash
 # Offline/airgapped: never reach the network for extensions

package/dist/cli/analyze.d.ts CHANGED Viewed

@@ -69,6 +69,8 @@ export interface AnalyzeOptions {
     maxFileSize?: string;
     /** Override worker sub-batch idle timeout in seconds. */
     workerTimeout?: string;
+    /** Control LadybugDB WAL auto-checkpoint threshold during analyze. */
+    walCheckpointThreshold?: string;
     /** Parse worker pool size; 0 disables workers (sequential fallback). */
     workers?: string;
     embeddingThreads?: string;

package/dist/cli/analyze.js CHANGED Viewed

@@ -12,7 +12,7 @@ import { spawn } from 'child_process';
 import v8 from 'v8';
 import cliProgress from 'cli-progress';
 import { closeLbug } from '../core/lbug/lbug-adapter.js';
-import { isWalCorruptionError, WAL_RECOVERY_SUGGESTION } from '../core/lbug/lbug-config.js';
+import { isLbugCheckpointIoError, isWalCorruptionError, parseWalCheckpointThreshold, WAL_RECOVERY_SUGGESTION, } from '../core/lbug/lbug-config.js';
 import { getStoragePaths, getGlobalRegistryPath, RegistryNameCollisionError, AnalysisNotFinalizedError, assertAnalysisFinalized, } from '../storage/repo-manager.js';
 import { getGitRoot, hasGitDir } from '../storage/git.js';
 import { runFullAnalysis } from '../core/run-analyze.js';
@@ -322,6 +322,14 @@ const forceHeapOOMForTestIfEnabled = () => {
     for (;;)
         chunks.push('x'.repeat(1024 * 1024));
 };
+// 64 MiB keeps auto-checkpoint enabled but triggers less frequently than
+// Ladybug's stock ~16 MiB threshold, reducing rename/remove churn on large
+// runs. Also matches the GitNexus default in `lbug-config.ts`.
+//
+// IMPORTANT: keep README examples (`README.md`, `gitnexus/README.md`) and
+// the `DEFAULT_WAL_CHECKPOINT_THRESHOLD` constant in
+// `gitnexus/src/core/lbug/lbug-config.ts` in sync with this value.
+const RECOMMENDED_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024;
 /** Re-exec the process with a 16GB heap and larger stack if we're currently below that. */
 async function ensureHeap() {
     const nodeOpts = process.env.NODE_OPTIONS || '';
@@ -378,6 +386,8 @@ const ANALYZE_CLI_ENV_KEYS = [
     'GITNEXUS_PROFILE_DEFERRED_SLOW_MS',
     'GITNEXUS_MAX_FILE_SIZE',
     'GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS',
+    'GITNEXUS_WAL_CHECKPOINT_THRESHOLD',
+    'GITNEXUS_WAL_MANUAL_CHECKPOINT',
     'GITNEXUS_EMBEDDING_THREADS',
     'GITNEXUS_EMBEDDING_BATCH_SIZE',
     'GITNEXUS_EMBEDDING_SUB_BATCH_SIZE',
@@ -452,6 +462,15 @@ const analyzeCommandImpl = async (inputPath, options) => {
         }
         process.env.GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS = String(Math.round(workerTimeoutSeconds * 1000));
     }
+    if (options?.walCheckpointThreshold !== undefined) {
+        const parsed = parseWalCheckpointThreshold(options.walCheckpointThreshold);
+        if (parsed === undefined) {
+            cliError('  --wal-checkpoint-threshold must be an integer >= -1.\n');
+            process.exitCode = 1;
+            return;
+        }
+        process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD = String(parsed);
+    }
     // `--workers` is threaded through `runFullAnalysis` options → PipelineOptions
     // → createWorkerPool, intentionally bypassing the GITNEXUS_WORKER_POOL_SIZE
     // env channel so this CLI surface never mutates `process.env` for pool size.
@@ -859,6 +878,16 @@ const analyzeCommandImpl = async (inputPath, options) => {
             process.exitCode = 1;
             return;
         }
+        if (isLbugCheckpointIoError(err)) {
+            cliError(`  LadybugDB failed while rotating/removing WAL checkpoint files.\n` +
+                `  This can happen when auto-checkpoint runs at the default threshold (~16MB).\n` +
+                `  Retry with a larger checkpoint threshold to reduce checkpoint frequency:\n` +
+                `    gitnexus analyze --wal-checkpoint-threshold ${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD}\n` +
+                `    (or set GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${RECOMMENDED_WAL_CHECKPOINT_THRESHOLD})\n` +
+                `    (Try 33554432 = 32 MiB on small-disk / CI runners.)\n`, { recoveryHint: 'wal-checkpoint-threshold' });
+            process.exitCode = 1;
+            return;
+        }
         // HF download failure — show clean guidance without the raw stack trace.
         // Checked before writeFatalToStderr so the user sees one focused message
         // rather than a stack-trace dump followed by a second remediation block.

package/dist/cli/cli-message.d.ts CHANGED Viewed

@@ -1,15 +1,37 @@
+/**
+ * String-literal union of all `recoveryHint` tags emitted by the CLI.
+ *
+ * Centralized so a new recovery branch added in `analyze.ts` cannot land
+ * without updating this union — TypeScript will reject the unknown literal
+ * passed via `cliError({ recoveryHint: '...' })`. To add a new hint:
+ *   1. Add the tag string to this union.
+ *   2. Pass it as the `recoveryHint` field at the relevant `cliError`
+ *      call site.
+ *
+ * Consumers can import this type to narrow log-record `recoveryHint`
+ * fields without restating the literal list.
+ */
+export type RecoveryHint = 'wal-corruption' | 'wal-checkpoint-threshold' | 'heap-oom-respawn' | 'native-worker-abort' | 'hf-endpoint-unreachable' | 'large-repo' | 'npm-resolution' | 'module-not-found';
+/**
+ * Common shape for the optional structured-field bag passed to
+ * `cliError`/`cliWarn`/`cliInfo`. Typed so the `recoveryHint` slot is
+ * checked against the {@link RecoveryHint} union.
+ */
+export interface CliMessageFields extends Record<string, unknown> {
+    recoveryHint?: RecoveryHint;
+}
 /**
  * User-facing informational message. Use for banners, listening URLs,
  * and any message the user expects to read in plain text.
  */
-export declare function cliInfo(msg: string, fields?: Record<string, unknown>): void;
+export declare function cliInfo(msg: string, fields?: CliMessageFields): void;
 /**
  * User-facing warning. Operator-actionable but non-fatal — `cliWarn`
  * indicates the command can still proceed in some form.
  */
-export declare function cliWarn(msg: string, fields?: Record<string, unknown>): void;
+export declare function cliWarn(msg: string, fields?: CliMessageFields): void;
 /**
  * User-facing error. Indicates the command cannot proceed; usually
  * paired with a non-zero exit code at the call site.
  */
-export declare function cliError(msg: string, fields?: Record<string, unknown>): void;
+export declare function cliError(msg: string, fields?: CliMessageFields): void;

package/dist/cli/index.js CHANGED Viewed

@@ -38,6 +38,8 @@ program
     .option('-v, --verbose', 'Enable verbose ingestion warnings (default: false)')
     .option('--max-file-size <kb>', 'Skip files larger than this (KB). Default: 512. Hard cap: 32768 (tree-sitter limit).')
     .option('--worker-timeout <seconds>', 'Worker sub-batch idle timeout before retry/fallback. Default: 30.')
+    .option('--wal-checkpoint-threshold <bytes>', 'LadybugDB WAL auto-checkpoint threshold in bytes during analyze ' +
+    '(integer >= -1; default: 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).')
     .option('--workers <n>', 'Parse worker pool size. Default: cores-1 capped at 16. Pass 0 to disable workers (sequential).')
     .option('--embedding-threads <n>', 'Limit local ONNX embedding CPU threads')
     .option('--embedding-batch-size <n>', 'Number of nodes per embedding batch')
@@ -47,6 +49,7 @@ program
     '  GITNEXUS_NO_GITIGNORE=1   Skip .gitignore parsing (still reads .gitnexusignore)\n' +
     '  GITNEXUS_MAX_FILE_SIZE=N  Override large-file skip threshold (KB). Default 512, max 32768.\n' +
     '  GITNEXUS_WORKER_SUB_BATCH_TIMEOUT_MS=N  Worker idle timeout in milliseconds. Default 30000.\n' +
+    '  GITNEXUS_WAL_CHECKPOINT_THRESHOLD=N  LadybugDB WAL auto-checkpoint threshold in bytes (default 67108864 = 64 MiB; -1 keeps Ladybug stock ~16 MiB).\n' +
     '  GITNEXUS_WORKER_SUB_BATCH_MAX_BYTES=N  Worker job byte budget. Default 8388608.\n' +
     '  GITNEXUS_WORKER_POOL_SIZE=N  Parse worker count override. Default cores-1 capped at 16.\n' +
     '  GITNEXUS_PARSE_CHUNK_CONCURRENCY=N  Concurrent in-flight parse chunks. Default 2.\n' +
@@ -55,6 +58,7 @@ program
     '  GITNEXUS_WORKER_CONSECUTIVE_FAILURE_THRESHOLD=N  Per-slot deaths to trip circuit breaker. Default max(3, poolSize).\n' +
     '  GITNEXUS_EMBEDDING_THREADS=N  Limit local ONNX CPU threads for --embeddings.\n' +
     '  GITNEXUS_SEMANTIC_EXACT_SCAN_LIMIT=N  Max embedding chunks for exact-scan fallback. Default 10000.\n' +
+    '\nFlags override the corresponding env vars when both are provided.\n' +
     '\nTip: `.gitnexusignore` supports `.gitignore`-style negation. Add e.g.\n' +
     '     `!__tests__/` to index a directory that is auto-filtered by default (#771).')
     .action(createLazyAction(() => import('./analyze.js'), 'analyzeCommand'));

package/dist/core/lbug/lbug-adapter.d.ts CHANGED Viewed

@@ -142,6 +142,21 @@ export declare const fetchExistingEmbeddingHashes: (execQuery: (cypher: string)
  * @see safeClose — CHECKPOINT + connection/database close
  */
 export declare const flushWAL: () => Promise<void>;
+/**
+ * Issue a manual `CHECKPOINT` against the current connection and surface
+ * any engine error to the caller. Unlike {@link flushWAL}, this variant
+ * does NOT swallow Ladybug rename/remove IO failures — the manual
+ * checkpoint driver (`wal-checkpoint-driver.ts`) relies on the rejection
+ * to drive its bounded retry loop. Returns `false` when no connection is
+ * open (the caller treats this as a no-op success — there is no WAL to
+ * flush). Returns `true` after a successful CHECKPOINT + drain.
+ *
+ * The split from `flushWAL` is deliberate: every other CHECKPOINT site
+ * (server flush, safeClose) is best-effort and prefers a silent skip;
+ * the manual driver, by contrast, must observe failures to decide
+ * whether to retry.
+ */
+export declare const tryFlushWAL: () => Promise<boolean>;
 /**
  * Flush the WAL and close the connection and database handles.
  *

package/dist/core/lbug/lbug-adapter.js CHANGED Viewed

@@ -1334,6 +1334,27 @@ export const flushWAL = async () => {
         logger.debug(`GitNexus: LadybugDB CHECKPOINT skipped/failed during WAL flush: ${summarizeError(err)}`);
     }
 };
+/**
+ * Issue a manual `CHECKPOINT` against the current connection and surface
+ * any engine error to the caller. Unlike {@link flushWAL}, this variant
+ * does NOT swallow Ladybug rename/remove IO failures — the manual
+ * checkpoint driver (`wal-checkpoint-driver.ts`) relies on the rejection
+ * to drive its bounded retry loop. Returns `false` when no connection is
+ * open (the caller treats this as a no-op success — there is no WAL to
+ * flush). Returns `true` after a successful CHECKPOINT + drain.
+ *
+ * The split from `flushWAL` is deliberate: every other CHECKPOINT site
+ * (server flush, safeClose) is best-effort and prefers a silent skip;
+ * the manual driver, by contrast, must observe failures to decide
+ * whether to retry.
+ */
+export const tryFlushWAL = async () => {
+    if (!conn)
+        return false;
+    const checkpointResult = await conn.query('CHECKPOINT');
+    await drainQueryResult(checkpointResult);
+    return true;
+};
 /**
  * Flush the WAL and close the connection and database handles.
  *

package/dist/core/lbug/lbug-config.d.ts CHANGED Viewed

@@ -32,8 +32,15 @@ import type lbug from '@ladybugdb/core';
  * integer; anything invalid falls back to the default.
  */
 export declare const LBUG_MAX_DB_SIZE: number;
+export declare const parseWalCheckpointThreshold: (raw: string | undefined) => number | undefined;
 export declare const WAL_RECOVERY_SUGGESTION = "WAL corruption detected. Run `gitnexus analyze --force` to rebuild the index.";
 export declare function isWalCorruptionError(err: unknown): boolean;
+/**
+ * True when `err` looks like a Ladybug WAL-checkpoint rotation/remove IO
+ * failure. Tries strict matchers first (renames + removes), then falls
+ * back to the permissive matcher.
+ */
+export declare const isLbugCheckpointIoError: (err: unknown) => boolean;
 type LbugModule = typeof lbug;
 export interface LbugDatabaseOptions {
     readOnly?: boolean;

package/dist/core/lbug/lbug-config.js CHANGED Viewed

@@ -1,6 +1,7 @@
 import fs from 'fs/promises';
 import os from 'os';
 import path from 'path';
+import { logger } from '../logger.js';
 /**
  * Shared configuration for `@ladybugdb/core` `Database` construction.
  *
@@ -42,6 +43,43 @@ export const LBUG_MAX_DB_SIZE = (() => {
     }
     return 16 * 1024 * 1024 * 1024;
 })();
+export const parseWalCheckpointThreshold = (raw) => {
+    if (raw === undefined)
+        return undefined;
+    const normalized = raw.trim();
+    if (normalized.length === 0)
+        return undefined;
+    const parsed = Number(normalized);
+    if (!Number.isInteger(parsed) || parsed < -1)
+        return undefined;
+    return parsed;
+};
+/**
+ * Default GitNexus WAL auto-checkpoint threshold in bytes (64 MiB).
+ *
+ * Larger than Ladybug's stock ~16 MiB to reduce checkpoint rename/remove
+ * churn under heavy analyze write load — the original race that motivated
+ * issue #1741 triggered at the stock threshold. README examples in
+ * `README.md` and `gitnexus/README.md` and the recovery hint in
+ * `analyze.ts` MUST stay in sync with this value.
+ */
+const DEFAULT_WAL_CHECKPOINT_THRESHOLD = 64 * 1024 * 1024;
+const resolveCheckpointThreshold = () => {
+    const raw = process.env.GITNEXUS_WAL_CHECKPOINT_THRESHOLD;
+    if (raw === undefined)
+        return DEFAULT_WAL_CHECKPOINT_THRESHOLD;
+    const parsed = parseWalCheckpointThreshold(raw);
+    if (parsed !== undefined)
+        return parsed;
+    // Non-empty but unparseable input: warn the operator and fall back. Mirrors
+    // the CLI's `--wal-checkpoint-threshold` validation (which hard-errors)
+    // but the env-var path stays soft to preserve "set once in your shell"
+    // ergonomics across mixed-version invocations.
+    if (raw.trim().length > 0) {
+        logger.warn({ rawValue: raw, fallback: DEFAULT_WAL_CHECKPOINT_THRESHOLD }, `Ignoring invalid GITNEXUS_WAL_CHECKPOINT_THRESHOLD=${raw}; expected integer >= -1; falling back to default (${DEFAULT_WAL_CHECKPOINT_THRESHOLD}).`);
+    }
+    return DEFAULT_WAL_CHECKPOINT_THRESHOLD;
+};
 /** Matches WAL corruption errors from the LadybugDB engine. */
 const WAL_CORRUPTION_RE = /corrupt(ed)?\s+wal|invalid\s+wal\s+record|wal.*corrupt|checksum.*wal/i;
 export const WAL_RECOVERY_SUGGESTION = 'WAL corruption detected. Run `gitnexus analyze --force` to rebuild the index.';
@@ -51,6 +89,48 @@ export function isWalCorruptionError(err) {
     const msg = err instanceof Error ? err.message : String(err);
     return WAL_CORRUPTION_RE.test(msg);
 }
+// ─── Ladybug WAL checkpoint IO error matchers ───────────────────────────────
+//
+// Matched against LadybugDB v0.16.1 (see `gitnexus/package.json`
+// @ladybugdb/core). Strict regexes encode local_file_system.cpp wording
+// verified at that version. Two-tier strategy: strict matchers first so we
+// only fire on real checkpoint-rotation shapes; a permissive fallback
+// catches future Ladybug message drift so the recovery hint keeps surfacing
+// even if upstream wording changes.
+//
+// From Ladybug native LocalFileSystem exceptions (`local_file_system.cpp`),
+// surfaced in Node as:
+// "Runtime exception: IO exception: Error renaming file ..."
+// "Runtime exception: IO exception: Error removing directory or file ..."
+// We only match checkpoint-rotation shapes:
+//   - "<db>.wal -> <db>.wal.checkpoint" rename failures
+//   - "<db>.wal.checkpoint" remove failures
+// Example matches:
+//   "Runtime exception: IO exception: Error renaming file /x/lbug.wal to /x/lbug.wal.checkpoint. ErrorMessage: Permission denied"
+//   "Runtime exception: IO exception: Error removing directory or file /x/lbug.wal.checkpoint.  Error Message: Permission denied"
+// Matching is case-insensitive to remain robust across wrappers/platforms.
+const LBUG_CHECKPOINT_RENAME_RE = /^runtime exception: io exception:\s*error renaming file\s+.+?\.wal\s+to\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i;
+const LBUG_CHECKPOINT_REMOVE_RE = /^runtime exception: io exception:\s*error removing directory or file\s+.+?\.wal\.checkpoint(?:\.|\s|$)/i;
+/**
+ * Permissive fallback: any IO-exception-shaped message that mentions a
+ * `.wal.checkpoint` path. Catches future Ladybug message drift (different
+ * verb, additional preamble, locale variation) so the recovery hint keeps
+ * surfacing even if the strict regexes go stale.
+ */
+const LBUG_CHECKPOINT_PERMISSIVE_RE = /io exception.*\.wal\.checkpoint/i;
+/**
+ * True when `err` looks like a Ladybug WAL-checkpoint rotation/remove IO
+ * failure. Tries strict matchers first (renames + removes), then falls
+ * back to the permissive matcher.
+ */
+export const isLbugCheckpointIoError = (err) => {
+    if (!err)
+        return false;
+    const msg = err instanceof Error ? err.message : String(err);
+    if (LBUG_CHECKPOINT_RENAME_RE.test(msg) || LBUG_CHECKPOINT_REMOVE_RE.test(msg))
+        return true;
+    return LBUG_CHECKPOINT_PERMISSIVE_RE.test(msg);
+};
 /**
  * Return true when the error message indicates that a LadybugDB file lock
  * could not be acquired — either at construction time
@@ -76,8 +156,8 @@ export function createLbugDatabase(lbugModule, databasePath, options = {}) {
     // .d.ts declares fewer args than the native constructor accepts.
     return new lbugModule.Database(databasePath, 0, // bufferManagerSize
     false, // enableCompression (pinned for v0.16.0)
-    options.readOnly ?? false, LBUG_MAX_DB_SIZE, true, // autoCheckpoint
-    -1, // checkpointThreshold
+    options.readOnly ?? false, LBUG_MAX_DB_SIZE, true, // autoCheckpoint (always on)
+    resolveCheckpointThreshold(), // checkpointThreshold (default 64 MiB; override with GITNEXUS_WAL_CHECKPOINT_THRESHOLD; -1 keeps Ladybug stock ~16 MiB)
     options.throwOnWalReplayFailure ?? true, true);
 }
 // ─── Lock-busy retry tuning knobs ───────────────────────────────────────────

package/dist/core/lbug/wal-checkpoint-driver.d.ts ADDED Viewed

@@ -0,0 +1,98 @@
+/**
+ * Manual WAL checkpoint driver with bounded retry (#1741 follow-up).
+ *
+ * Background
+ * ----------
+ * LadybugDB's native auto-checkpoint runs from inside the C++ engine on a
+ * background path that has no JS-side hook for mid-write rotation. When
+ * the rename of `<db>.wal` → `<db>.wal.checkpoint` races a transient file
+ * lock (Windows Defender, AV scanner, NTFS shadow copy) the engine raises
+ * a `Runtime exception: IO exception: Error renaming file …` that aborts
+ * the in-flight write. There is no engine-level retry.
+ *
+ * The auto-checkpoint cannot be made retryable from JS, but a *manual*
+ * `CHECKPOINT` query that the JS layer issues itself CAN be wrapped in a
+ * bounded retry. By draining the WAL on a tight cadence — more often than
+ * the native threshold — the auto-checkpoint almost never has work left
+ * to do, so the un-retriable native rename race is moved into the
+ * JS-controlled path where this module's retry absorbs it.
+ *
+ * Design contract
+ * ---------------
+ * - `autoCheckpoint` stays on (maintainer requirement). This driver is
+ *   additive: it preempts the native checkpoint, it does not replace it.
+ * - The driver runs ONLY during analyze (callers opt-in explicitly). MCP
+ *   and other long-lived flows continue to rely on the close-time
+ *   CHECKPOINT in `safeClose`.
+ * - Opt-out is via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`. Default is on.
+ * - Retries only fire on `isLbugCheckpointIoError` — every other error
+ *   surfaces immediately. The retry budget is small (3 attempts) with
+ *   jittered backoff so a chronic rename failure escalates fast.
+ * - Retry attempts log at `debug`; only the final, exhausted failure
+ *   surfaces to the caller (and is logged at `warn` here for operators).
+ */
+/**
+ * Run a single CHECKPOINT with bounded retry on
+ * `isLbugCheckpointIoError`. On success resolves with
+ * `{ attempts, flushed }`: `attempts` is the number of attempts spent
+ * (1 to `CHECKPOINT_RETRY_ATTEMPTS`) and `flushed` is the boolean returned
+ * by the checkpoint call. Rethrows the last checkpoint error after
+ * exhausting the budget. Non-checkpoint errors (e.g. WAL corruption,
+ * lock-busy) propagate immediately, on whichever attempt they occur —
+ * those are not what this retry is designed to absorb.
+ *
+ * The split from `flushWAL` is deliberate: `flushWAL` is the swallow-and-
+ * log helper used by `safeClose` and the server's best-effort flush,
+ * which by contract cannot fail the surrounding operation. The manual
+ * driver MUST observe failures to decide whether to retry, and that is
+ * the role of `tryFlushWAL`.
+ *
+ * Exported for direct unit testing — production callers use
+ * {@link startWalCheckpointDriver} or {@link checkpointOnce}.
+ */
+export declare const runCheckpointWithRetry: (options?: {
+    /** Override the sleep implementation for tests. */
+    sleepFn?: (ms: number) => Promise<void>;
+    /** Override the CHECKPOINT call for tests. */
+    checkpointFn?: () => Promise<boolean>;
+    /** Override the jitter source for tests. Returns a value in [0, 1). */
+    randomFn?: () => number;
+}) => Promise<{
+    attempts: number;
+    flushed: boolean;
+}>;
+/**
+ * Single-shot manual checkpoint. Use this when the caller drives the
+ * cadence itself (e.g. a phase boundary in `runFullAnalysis`).
+ *
+ * Honors the `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` opt-out so operators can
+ * disable the manual path if it ever interacts badly with a future
+ * Ladybug release.
+ */
+export declare const checkpointOnce: () => Promise<void>;
+/**
+ * Start a periodic manual checkpoint driver. The returned handle has a
+ * `stop()` method that resolves once the in-flight checkpoint (if any)
+ * settles, so callers can `await driver.stop()` before close-time
+ * `safeClose` and avoid racing the final flush.
+ *
+ * The first checkpoint fires after `periodMs` (not immediately) so a
+ * cold analyze does not pay a CHECKPOINT round trip before any writes
+ * have happened.
+ */
+export interface WalCheckpointDriver {
+    /** Stop the driver and await any in-flight checkpoint. Idempotent. */
+    stop(): Promise<void>;
+}
+export declare const startWalCheckpointDriver: (options?: {
+    periodMs?: number;
+}) => WalCheckpointDriver;
+/**
+ * Reading `GITNEXUS_WAL_MANUAL_CHECKPOINT` at every call site (rather
+ * than caching at module load) keeps `analyzeCommand` env restoration
+ * honest: tests that toggle the flag between invocations see the live
+ * value, matching the `ANALYZE_CLI_ENV_KEYS` snapshot/restore contract
+ * in `analyze.ts`.
+ *
+ * Accepted opt-out values: '0', 'false', 'off', 'no' (case-insensitive).
+ * Anything else — including undefined — leaves the driver enabled.
+ */
+export declare const isManualCheckpointEnabled: () => boolean;

package/dist/core/lbug/wal-checkpoint-driver.js ADDED Viewed

@@ -0,0 +1,189 @@
+/**
+ * Manual WAL checkpoint driver with bounded retry (#1741 follow-up).
+ *
+ * Background
+ * ----------
+ * LadybugDB's native auto-checkpoint runs from inside the C++ engine on a
+ * background path that has no JS-side hook for mid-write rotation. When
+ * the rename of `<db>.wal` → `<db>.wal.checkpoint` races a transient file
+ * lock (Windows Defender, AV scanner, NTFS shadow copy) the engine raises
+ * a `Runtime exception: IO exception: Error renaming file …` that aborts
+ * the in-flight write. There is no engine-level retry.
+ *
+ * The auto-checkpoint cannot be made retryable from JS, but a *manual*
+ * `CHECKPOINT` query that the JS layer issues itself CAN be wrapped in a
+ * bounded retry. By draining the WAL on a tight cadence — more often than
+ * the native threshold — the auto-checkpoint almost never has work left
+ * to do, so the un-retriable native rename race is moved into the
+ * JS-controlled path where this module's retry absorbs it.
+ *
+ * Design contract
+ * ---------------
+ * - `autoCheckpoint` stays on (maintainer requirement). This driver is
+ *   additive: it preempts the native checkpoint, it does not replace it.
+ * - The driver runs ONLY during analyze (callers opt-in explicitly). MCP
+ *   and other long-lived flows continue to rely on the close-time
+ *   CHECKPOINT in `safeClose`.
+ * - Opt-out is via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0`. Default is on.
+ * - Retries only fire on `isLbugCheckpointIoError` — every other error
+ *   surfaces immediately. The retry budget is small (3 attempts) with
+ *   jittered backoff so a chronic rename failure escalates fast.
+ * - Retry attempts log at `debug`; only the final, exhausted failure
+ *   surfaces to the caller (and is logged at `warn` here for operators).
+ */
+import { logger } from '../logger.js';
+import { tryFlushWAL } from './lbug-adapter.js';
+import { isLbugCheckpointIoError } from './lbug-config.js';
+/**
+ * Bounded retry budget. A sleep happens only between attempts, so with
+ * three attempts at most two of the delays below are ever waited
+ * (50 ms + 200 ms = ~250 ms before jitter), plus up to three CHECKPOINT
+ * round trips — small enough to stay invisible during a large analyze,
+ * large enough to ride out a single AV scanner sweep on Windows.
+ */
+const CHECKPOINT_RETRY_ATTEMPTS = 3;
+/**
+ * Base back-off in ms. Each attempt waits `BASE_DELAYS[attempt-1]`
+ * milliseconds before the next try, plus a small jitter to avoid
+ * synchronized retries when multiple analyzers ever share a host.
+ */
+const BASE_DELAYS_MS = [50, 200, 500];
+/** Maximum jitter added on top of each base delay. */
+const JITTER_MAX_MS = 50;
+const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
+/**
+ * Run a single CHECKPOINT with bounded retry on
+ * `isLbugCheckpointIoError`. Returns the number of attempts actually
+ * spent (1-`CHECKPOINT_RETRY_ATTEMPTS`) on success, or rethrows the last
+ * checkpoint error after exhausting the budget. Non-checkpoint errors
+ * (e.g. WAL corruption, lock-busy) propagate immediately on the first
+ * attempt — those are not what this retry is designed to absorb.
+ *
+ * The split from `flushWAL` is deliberate: `flushWAL` is the swallow-and-
+ * log helper used by `safeClose` and the server's best-effort flush,
+ * which by contract cannot fail the surrounding operation. The manual
+ * driver MUST observe failures to decide whether to retry, and that is
+ * the role of `tryFlushWAL`.
+ *
+ * Exported for direct unit testing — production callers use
+ * {@link startWalCheckpointDriver} or {@link checkpointOnce}.
+ */
+export const runCheckpointWithRetry = async (options = {}) => {
+    const sleepImpl = options.sleepFn ?? sleep;
+    const checkpointImpl = options.checkpointFn ?? tryFlushWAL;
+    const randomImpl = options.randomFn ?? Math.random;
+    let lastError;
+    for (let attempt = 1; attempt <= CHECKPOINT_RETRY_ATTEMPTS; attempt++) {
+        try {
+            const flushed = await checkpointImpl();
+            return { attempts: attempt, flushed };
+        }
+        catch (err) {
+            lastError = err;
+            if (!isLbugCheckpointIoError(err)) {
+                // Non-checkpoint error — propagate immediately. Examples:
+                // WAL corruption, missing connection, query syntax failure.
+                // Retrying these would only mask the real signal.
+                throw err;
+            }
+            if (attempt === CHECKPOINT_RETRY_ATTEMPTS)
+                break;
+            const base = BASE_DELAYS_MS[Math.min(attempt - 1, BASE_DELAYS_MS.length - 1)] ?? 500;
+            // randomImpl defaults to Math.random — non-cryptographic by design; jitter only avoids
+            // synchronized retries between concurrent analyzers.
+            const delayMs = base + Math.floor(randomImpl() * JITTER_MAX_MS);
+            logger.debug({ attempt, totalAttempts: CHECKPOINT_RETRY_ATTEMPTS, delayMs }, 'GitNexus: WAL checkpoint IO error — retrying');
+            await sleepImpl(delayMs);
+        }
+    }
+    logger.warn({ attempts: CHECKPOINT_RETRY_ATTEMPTS }, 'GitNexus: manual WAL checkpoint exhausted retry budget — surfacing IO error to caller');
+    throw lastError;
+};
+/**
+ * Single-shot manual checkpoint. Use this when the caller drives the
+ * cadence itself (e.g. a phase boundary in `runFullAnalysis`).
+ *
+ * Honors the `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` opt-out so operators can
+ * disable the manual path if it ever interacts badly with a future
+ * Ladybug release.
+ */
+export const checkpointOnce = async () => {
+    if (!isManualCheckpointEnabled())
+        return;
+    await runCheckpointWithRetry();
+};
+/** Default cadence (ms) for the periodic driver. */
+const DEFAULT_PERIOD_MS = 5_000;
+export const startWalCheckpointDriver = (options = {}) => {
+    if (!isManualCheckpointEnabled()) {
+        return { stop: async () => undefined };
+    }
+    const periodMs = options.periodMs ?? DEFAULT_PERIOD_MS;
+    let stopped = false;
+    let inflight = null;
+    const tick = async () => {
+        if (stopped)
+            return;
+        inflight = runCheckpointWithRetry()
+            .then(() => undefined)
+            .catch((err) => {
+            // The retry budget exhausted. The caller's surrounding write
+            // will see the same engine error on its next operation and the
+            // `analyzeCommand` catch block will emit the recovery hint.
+            // Logging here keeps the operator-visible trail without
+            // double-logging the user-facing message.
+            logger.warn({ err: err instanceof Error ? err.message : String(err) }, 'GitNexus: manual WAL checkpoint failed after retries');
+        });
+        try {
+            await inflight;
+        }
+        finally {
+            inflight = null;
+        }
+    };
+    const handle = setInterval(() => {
+        // Fire-and-forget: setInterval cannot await directly. The next tick
+        // is guarded by `stopped` and the `inflight` reference.
+        void tick();
+    }, periodMs);
+    // `setInterval` returned by Node is a `Timeout` object with `.unref()`
+    // so a hung driver never prevents process exit.
+    if (typeof handle.unref === 'function') {
+        handle.unref();
+    }
+    return {
+        stop: async () => {
+            if (stopped) {
+                if (inflight)
+                    await inflight;
+                return;
+            }
+            stopped = true;
+            clearInterval(handle);
+            if (inflight) {
+                try {
+                    await inflight;
+                }
+                catch {
+                    /* swallowed in tick() — surface path is the surrounding write */
+                }
+            }
+        },
+    };
+};
+/**
+ * Reading `GITNEXUS_WAL_MANUAL_CHECKPOINT` at every call site (rather
+ * than caching at module load) keeps `analyzeCommand` env restoration
+ * honest: tests that toggle the flag between invocations see the live
+ * value, matching the `ANALYZE_CLI_ENV_KEYS` snapshot/restore contract
+ * in `analyze.ts`.
+ *
+ * Accepted opt-out values: '0', 'false', 'off', 'no' (case-insensitive).
+ * Anything else — including undefined — leaves the driver enabled.
+ */
+export const isManualCheckpointEnabled = () => {
+    const raw = process.env.GITNEXUS_WAL_MANUAL_CHECKPOINT;
+    if (raw === undefined)
+        return true;
+    const normalized = raw.trim().toLowerCase();
+    return !['0', 'false', 'off', 'no'].includes(normalized);
+};

package/dist/core/run-analyze.js CHANGED Viewed

@@ -14,6 +14,7 @@ import { execFileSync } from 'child_process';
 import { runPipelineFromRepo } from './ingestion/pipeline.js';
 import { initLbug, loadGraphToLbug, getLbugStats, executeQuery, executeWithReusedStatement, closeLbug, loadCachedEmbeddings, deleteNodesForFile, deleteAllCommunitiesAndProcesses, queryImporters, } from './lbug/lbug-adapter.js';
 import { createSearchFTSIndexes, verifySearchFTSIndexes } from './search/fts-indexes.js';
+import { startWalCheckpointDriver, } from './lbug/wal-checkpoint-driver.js';
 import { getStoragePaths, saveMeta, loadMeta, ensureGitNexusIgnored, registerRepo, cleanupOldKuzuFiles, INCREMENTAL_SCHEMA_VERSION, } from '../storage/repo-manager.js';
 import { computeFileHashes, diffFileHashes } from '../storage/file-hash.js';
 import { extractChangedSubgraph, computeEffectiveWriteSet, } from './incremental/subgraph-extract.js';
@@ -343,6 +344,15 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
         }
     }
     await initLbug(lbugPath);
+    // Manual WAL checkpoint driver (#1741): periodically drain the WAL
+    // from JS so the un-retriable native auto-checkpoint almost never
+    // has work left to do. Failures of the manual CHECKPOINT are absorbed
+    // by the driver's bounded retry; the final un-recoverable error still
+    // surfaces via the surrounding write that follows the failed flush.
+    // Opt-out via `GITNEXUS_WAL_MANUAL_CHECKPOINT=0` (the driver itself
+    // returns a no-op handle when disabled). Analyze-only: MCP and serve
+    // paths continue to rely on the close-time CHECKPOINT in `safeClose`.
+    const walCheckpointDriver = startWalCheckpointDriver();
     try {
         // All work after initLbug is wrapped in try/finally to ensure closeLbug()
         // is called even if an error occurs — the module-level singleton DB handle
@@ -726,6 +736,9 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
             // Best-effort — don't fail the entire analysis for context file issues
         }
         // ── Close LadybugDB ──────────────────────────────────────────────
+        // Stop the manual checkpoint driver before closeLbug so its
+        // in-flight CHECKPOINT cannot race the `safeClose` CHECKPOINT.
+        await walCheckpointDriver.stop();
         await closeLbug();
         progress('done', 100, 'Done');
         return {
@@ -736,7 +749,14 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
         };
     }
     catch (err) {
-        // Ensure LadybugDB is closed even on error
+        // Ensure LadybugDB is closed even on error. Stop the driver first
+        // so its retry loop cannot extend an already-failing analyze.
+        try {
+            await walCheckpointDriver.stop();
+        }
+        catch {
+            /* swallow — surface path is the rethrow below */
+        }
         try {
             await closeLbug();
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "gitnexus",
-  "version": "1.6.6-rc.42",
+  "version": "1.6.6-rc.43",
   "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
   "author": "Abhigyan Patwari",
   "license": "PolyForm-Noncommercial-1.0.0",