nexus-prime 7.2.0 → 7.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,6 +40,26 @@ function safeStatSize(filePath) {
40
40
  }
41
41
  }
42
42
  // ─────────────────────────────────────────────────────────────────────────────
43
+ // SQLite WAL footprint helpers — used by quota guard, rotation, and cleanup.
44
+ // SQLite databases are db + db-wal + db-shm. Any size accounting that ignores
45
+ // wal/shm misses the bug that turned ngram-index.db-wal into 84GB on disk.
46
+ // ─────────────────────────────────────────────────────────────────────────────
47
+ const NGRAM_DEFAULT_WAL_LIMIT_BYTES = 64 * 1024 * 1024; // 64 MB
48
+ const NGRAM_DEFAULT_FOOTPRINT_BYTES = 512 * 1024 * 1024; // 512 MB
49
+ const NGRAM_DEFAULT_CHECKPOINT_INTERVAL_MS = 30_000;
50
+ const NGRAM_DEFAULT_CHECKPOINT_DOC_COUNT = 200;
51
+ export function getNgramWalPath(dbPath) {
52
+ return `${dbPath}-wal`;
53
+ }
54
+ export function getNgramShmPath(dbPath) {
55
+ return `${dbPath}-shm`;
56
+ }
57
+ export function getNgramFootprintBytes(dbPath) {
58
+ return safeStatSize(dbPath)
59
+ + safeStatSize(getNgramWalPath(dbPath))
60
+ + safeStatSize(getNgramShmPath(dbPath));
61
+ }
62
+ // ─────────────────────────────────────────────────────────────────────────────
43
63
  // Character pair frequency weights (precomputed from common code patterns)
44
64
  // Rarer pairs get higher weights for sparse n-gram extraction.
45
65
  // ─────────────────────────────────────────────────────────────────────────────
@@ -157,6 +177,10 @@ export class NgramIndex {
157
177
  deleteStmt;
158
178
  lookupStmt;
159
179
  docExistsStmt;
180
+ // Write-side checkpoint accounting (Phase 2): bound WAL growth on hot indexers.
181
+ writesSinceCheckpoint = 0;
182
+ lastCheckpointAt = 0;
183
+ quotaSkipNoticeShown = false;
160
184
  // Search analytics
161
185
  _searchStats = {
162
186
  totalQueries: 0,
@@ -190,36 +214,69 @@ export class NgramIndex {
190
214
  }
191
215
  }
192
216
  rotateOversizeDbIfNeeded() {
217
+ // Count the full SQLite footprint (db + wal + shm). The 84GB regression
218
+ // happened because a 32MB db file had an 84GB -wal sibling that this
219
+ // routine never inspected.
193
220
  const rotateBytes = readEnvBytes('NEXUS_NGRAM_ROTATE_BYTES', 1024 * 1024 * 1024); // 1GB default
194
- const sizeBytes = safeStatSize(this.dbPath);
195
- if (sizeBytes <= 0 || sizeBytes < rotateBytes)
221
+ const dbBytes = safeStatSize(this.dbPath);
222
+ const footprint = getNgramFootprintBytes(this.dbPath);
223
+ if (footprint <= 0 || footprint < rotateBytes)
196
224
  return;
197
- // Keep only the most recent oversize archive to bound disk usage.
225
+ // Keep only the most recent oversize archive to bound disk usage. When the
226
+ // operator opts out of archiving (ARCHIVE_OVERSIZE=0), drop the leftovers
227
+ // outright instead of growing a backlog of multi-GB carcasses.
198
228
  const dir = path.dirname(this.dbPath);
199
229
  const base = path.basename(this.dbPath);
230
+ const archiveEnabled = process.env.NEXUS_NGRAM_ARCHIVE_OVERSIZE !== '0';
200
231
  const existing = fs.existsSync(dir)
201
232
  ? fs.readdirSync(dir).filter((entry) => entry.startsWith(`${base}.oversize.`)).sort().reverse()
202
233
  : [];
203
- for (const entry of existing.slice(1)) {
234
+ const keepCount = archiveEnabled ? 1 : 0;
235
+ for (const entry of existing.slice(keepCount)) {
204
236
  try {
205
237
  fs.rmSync(path.join(dir, entry), { force: true });
206
238
  }
207
239
  catch { /* best effort */ }
208
240
  }
209
- const rotatedPath = `${this.dbPath}.oversize.${Date.now()}`;
241
+ const stamp = Date.now();
242
+ const rotatedPath = `${this.dbPath}.oversize.${stamp}`;
243
+ const removeSibling = (suffix) => {
244
+ try {
245
+ fs.rmSync(`${this.dbPath}${suffix}`, { force: true });
246
+ }
247
+ catch { /* best effort */ }
248
+ };
210
249
  try {
211
- fs.renameSync(this.dbPath, rotatedPath);
212
- logNgramNoticeOnce(`ngram:rotated:${this.dbPath}`, `[NgramIndex] rotated oversized DB (${Math.round(sizeBytes / 1024 / 1024)}MB >= ${Math.round(rotateBytes / 1024 / 1024)}MB) db=${this.dbPath} rotated=${rotatedPath}`);
250
+ if (archiveEnabled) {
251
+ fs.renameSync(this.dbPath, rotatedPath);
252
+ }
253
+ else {
254
+ fs.rmSync(this.dbPath, { force: true });
255
+ }
256
+ // db is gone; the WAL/SHM siblings are stale and unsafe to keep — drop
257
+ // them so SQLite never re-attaches them on the next open.
258
+ removeSibling('-wal');
259
+ removeSibling('-shm');
260
+ logNgramNoticeOnce(`ngram:rotated:${this.dbPath}:${stamp}`, `[NgramIndex] rotated oversized DB (footprint=${Math.round(footprint / 1024 / 1024)}MB db=${Math.round(dbBytes / 1024 / 1024)}MB threshold=${Math.round(rotateBytes / 1024 / 1024)}MB archive=${archiveEnabled ? 'on' : 'off'}) db=${this.dbPath}${archiveEnabled ? ` rotated=${rotatedPath}` : ''}`);
213
261
  }
214
262
  catch (err) {
215
263
  logNgramNoticeOnce(`ngram:rotate-failed:${this.dbPath}`, `[NgramIndex] oversize rotation failed; continuing with existing DB db=${this.dbPath} err=${String(err?.message ?? err)}`);
216
264
  }
217
265
  }
218
266
  initSchema() {
267
+ const walLimitBytes = readEnvBytes('NEXUS_NGRAM_WAL_LIMIT_BYTES', NGRAM_DEFAULT_WAL_LIMIT_BYTES);
219
268
  this.db.pragma('journal_mode = WAL');
220
269
  this.db.pragma('synchronous = NORMAL');
221
270
  this.db.pragma('cache_size = -16000');
222
271
  this.db.pragma('busy_timeout = 5000');
272
+ // Bound WAL growth: autocheckpoint every 1000 pages, and set journal_size_limit
273
+ // so SQLite truncates an oversized WAL back to the cap when it resets (the 84GB
274
+ // regression). temp_store=MEMORY avoids spilling sort/temp tables into another
275
+ // on-disk file. mmap_size=0 disables memory-mapped I/O, which can keep pages alive on small VMs.
276
+ this.db.pragma('wal_autocheckpoint = 1000');
277
+ this.db.pragma(`journal_size_limit = ${walLimitBytes}`);
278
+ this.db.pragma('temp_store = MEMORY');
279
+ this.db.pragma('mmap_size = 0');
223
280
  this.db.exec(`
224
281
  CREATE TABLE IF NOT EXISTS ngram_postings (
225
282
  ngram_hash INTEGER NOT NULL,
@@ -288,8 +345,62 @@ export class NgramIndex {
288
345
  }
289
346
  }
290
347
  // ── Document Management ─────────────────────────────────────────────────
348
+ /** Run an opportunistic WAL checkpoint. PASSIVE first, then escalate to TRUNCATE
349
+ * if the WAL is still over its configured limit or `reason` is anything other than
350
+ * 'periodic'. Returns the WAL byte size observed after the attempt so callers can report progress. */
351
+ checkpointIfNeeded(reason) {
352
+ const walLimit = readEnvBytes('NEXUS_NGRAM_WAL_LIMIT_BYTES', NGRAM_DEFAULT_WAL_LIMIT_BYTES);
353
+ try {
354
+ this.db.exec('PRAGMA wal_checkpoint(PASSIVE)');
355
+ }
356
+ catch {
357
+ // best effort
358
+ }
359
+ let walBytes = this.getWalBytes();
360
+ if (walBytes > walLimit || reason !== 'periodic') {
361
+ try {
362
+ this.db.exec('PRAGMA wal_checkpoint(TRUNCATE)');
363
+ }
364
+ catch {
365
+ // best effort
366
+ }
367
+ walBytes = this.getWalBytes();
368
+ }
369
+ this.writesSinceCheckpoint = 0;
370
+ this.lastCheckpointAt = Date.now();
371
+ return walBytes;
372
+ }
373
+ /** Footprint quota check before a write. Returns true if the write should
374
+ * proceed; false if the footprint quota is exceeded and the write should be skipped.
375
+ * With NEXUS_NGRAM_STRICT_QUOTA=1 an exceeded quota throws instead, so callers can surface hard quota failures during tests/CI. */
376
+ allowWrite() {
377
+ const walLimit = readEnvBytes('NEXUS_NGRAM_WAL_LIMIT_BYTES', NGRAM_DEFAULT_WAL_LIMIT_BYTES);
378
+ const maxFootprint = readEnvBytes('NEXUS_NGRAM_MAX_FOOTPRINT_BYTES', NGRAM_DEFAULT_FOOTPRINT_BYTES);
379
+ const strict = process.env.NEXUS_NGRAM_STRICT_QUOTA === '1';
380
+ let walBytes = this.getWalBytes();
381
+ if (walBytes > walLimit) {
382
+ walBytes = this.checkpointIfNeeded('quota');
383
+ }
384
+ const footprint = this.getSqliteFootprintBytes();
385
+ if (footprint > maxFootprint) {
386
+ const msg = `[NgramIndex] footprint quota exceeded — db=${this.dbPath} footprint=${footprint} cap=${maxFootprint} wal=${walBytes}`;
387
+ if (strict) {
388
+ throw new Error(msg);
389
+ }
390
+ if (!this.quotaSkipNoticeShown) {
391
+ this.quotaSkipNoticeShown = true;
392
+ logNgramNoticeOnce(`ngram:quota-skip:${this.dbPath}`, msg);
393
+ }
394
+ return false;
395
+ }
396
+ // Recovered — re-arm the warning so a later breach prints again.
397
+ this.quotaSkipNoticeShown = false;
398
+ return true;
399
+ }
291
400
  /** Index a document's text content */
292
401
  addDocument(docId, text) {
402
+ if (!this.allowWrite())
403
+ return;
293
404
  // Remove existing postings for this doc (idempotent)
294
405
  this.deleteStmt.run(docId);
295
406
  const trigrams = extractTrigrams(text);
@@ -319,6 +430,19 @@ export class NgramIndex {
319
430
  `).run(docId, text.length, Date.now());
320
431
  });
321
432
  insertMany();
433
+ this.maybePeriodicCheckpoint();
434
+ }
435
+ /** Trigger a checkpoint when either the doc-count or the time-interval threshold
436
+ * is hit (PASSIVE, escalating to TRUNCATE only if the WAL is still over its limit). Cheap on a healthy DB, prevents WAL growth on bulk indexing. */
437
+ maybePeriodicCheckpoint() {
438
+ this.writesSinceCheckpoint += 1;
439
+ const intervalMs = readEnvBytes('NEXUS_NGRAM_CHECKPOINT_INTERVAL_MS', NGRAM_DEFAULT_CHECKPOINT_INTERVAL_MS);
440
+ const docThreshold = readEnvBytes('NEXUS_NGRAM_CHECKPOINT_DOCS', NGRAM_DEFAULT_CHECKPOINT_DOC_COUNT);
441
+ const dueByDocs = this.writesSinceCheckpoint >= docThreshold;
442
+ const dueByTime = Date.now() - this.lastCheckpointAt > intervalMs;
443
+ if (!dueByDocs && !dueByTime)
444
+ return;
445
+ this.checkpointIfNeeded('periodic');
322
446
  }
323
447
  /** Remove a document from the index */
324
448
  removeDocument(docId) {
@@ -337,6 +461,27 @@ export class NgramIndex {
337
461
  const row = this.db.prepare('SELECT COUNT(*) as cnt FROM ngram_docs').get();
338
462
  return row.cnt;
339
463
  }
464
+ // ── Footprint accessors ─────────────────────────────────────────────────
465
+ /** Path to the underlying SQLite database file. */
466
+ getDbPath() {
467
+ return this.dbPath;
468
+ }
469
+ /** Bytes occupied by the .db file alone. */
470
+ getDbBytes() {
471
+ return safeStatSize(this.dbPath);
472
+ }
473
+ /** Bytes occupied by the .db-wal file. */
474
+ getWalBytes() {
475
+ return safeStatSize(getNgramWalPath(this.dbPath));
476
+ }
477
+ /** Bytes occupied by the .db-shm file. */
478
+ getShmBytes() {
479
+ return safeStatSize(getNgramShmPath(this.dbPath));
480
+ }
481
+ /** Total SQLite footprint = db + wal + shm. Use this for quota checks. */
482
+ getSqliteFootprintBytes() {
483
+ return getNgramFootprintBytes(this.dbPath);
484
+ }
340
485
  // ── Search ──────────────────────────────────────────────────────────────
341
486
  /**
342
487
  * Search for documents matching a text query.
@@ -522,15 +667,24 @@ export class NgramIndex {
522
667
  }
523
668
  /**
524
669
  * Operator-focused maintenance for the on-disk ngram DB.
525
- * - Bounds runaway DB growth via rotation (default >= 1GB)
526
- * - Vacuums only when safe (<= vacuumMaxBytes) and either forced or dirty
670
+ * - Bounds runaway DB growth via rotation (default >= 1GB), counting the
671
+ * full SQLite footprint (db + wal + shm) so a runaway WAL triggers it.
672
+ * - Vacuums only when safe (<= vacuumMaxBytes) and either forced or dirty.
527
673
  */
528
674
  maintainBounded(options = {}) {
529
- const sizeBytes = safeStatSize(this.dbPath);
675
+ const dbBytes = safeStatSize(this.dbPath);
676
+ const footprint = this.getSqliteFootprintBytes();
530
677
  const rotateBytes = readEnvBytes('NEXUS_NGRAM_ROTATE_BYTES', 1024 * 1024 * 1024);
531
678
  const vacuumMaxBytes = readEnvBytes('NEXUS_NGRAM_VACUUM_MAX_BYTES', 256 * 1024 * 1024);
532
- if (sizeBytes >= rotateBytes && sizeBytes > 0) {
679
+ if (footprint >= rotateBytes && footprint > 0) {
680
+ const archiveEnabled = process.env.NEXUS_NGRAM_ARCHIVE_OVERSIZE !== '0';
533
681
  const rotatedPath = `${this.dbPath}.oversize.${Date.now()}`;
682
+ const removeSibling = (suffix) => {
683
+ try {
684
+ fs.rmSync(`${this.dbPath}${suffix}`, { force: true });
685
+ }
686
+ catch { /* best effort */ }
687
+ };
534
688
  try {
535
689
  try {
536
690
  this.db.exec('PRAGMA wal_checkpoint(TRUNCATE)');
@@ -540,7 +694,14 @@ export class NgramIndex {
540
694
  this.db.close();
541
695
  }
542
696
  catch { /* ignore */ }
543
- fs.renameSync(this.dbPath, rotatedPath);
697
+ if (archiveEnabled) {
698
+ fs.renameSync(this.dbPath, rotatedPath);
699
+ }
700
+ else {
701
+ fs.rmSync(this.dbPath, { force: true });
702
+ }
703
+ removeSibling('-wal');
704
+ removeSibling('-shm');
544
705
  }
545
706
  catch (err) {
546
707
  // Re-open if close succeeded but rename failed
@@ -553,13 +714,16 @@ export class NgramIndex {
553
714
  return {
554
715
  action: 'none',
555
716
  dbPath: this.dbPath,
556
- sizeBytes,
717
+ sizeBytes: footprint,
557
718
  reason: `rotate failed: ${String(err?.message ?? err)}`,
558
719
  };
559
720
  }
560
721
  // Recreate a fresh DB
561
722
  this.knownHashes.clear();
562
723
  this.storageDirty = false;
724
+ this.writesSinceCheckpoint = 0;
725
+ this.lastCheckpointAt = Date.now();
726
+ this.quotaSkipNoticeShown = false;
563
727
  this.db = new Database(this.dbPath);
564
728
  this.initSchema();
565
729
  this.prepareStatements();
@@ -567,26 +731,26 @@ export class NgramIndex {
567
731
  return {
568
732
  action: 'rotated',
569
733
  dbPath: this.dbPath,
570
- sizeBytes,
571
- previousPath: rotatedPath,
572
- reason: `db exceeded rotate threshold (${rotateBytes} bytes)`,
734
+ sizeBytes: footprint,
735
+ previousPath: archiveEnabled ? rotatedPath : undefined,
736
+ reason: `footprint exceeded rotate threshold (db=${dbBytes}, footprint=${footprint}, threshold=${rotateBytes})`,
573
737
  };
574
738
  }
575
- if (sizeBytes > 0 && sizeBytes <= vacuumMaxBytes && (options.force || this.storageDirty)) {
739
+ if (dbBytes > 0 && dbBytes <= vacuumMaxBytes && (options.force || this.storageDirty)) {
576
740
  this.optimizeStorage(true);
577
741
  return {
578
742
  action: 'vacuum',
579
743
  dbPath: this.dbPath,
580
- sizeBytes,
744
+ sizeBytes: dbBytes,
581
745
  reason: options.force ? 'forced vacuum' : 'storageDirty vacuum',
582
746
  };
583
747
  }
584
748
  return {
585
749
  action: 'none',
586
750
  dbPath: this.dbPath,
587
- sizeBytes,
588
- reason: sizeBytes > vacuumMaxBytes
589
- ? `skip vacuum: db too large (${sizeBytes} > ${vacuumMaxBytes})`
751
+ sizeBytes: footprint,
752
+ reason: dbBytes > vacuumMaxBytes
753
+ ? `skip vacuum: db too large (${dbBytes} > ${vacuumMaxBytes})`
590
754
  : 'no maintenance needed',
591
755
  };
592
756
  }
@@ -607,9 +771,31 @@ export class NgramIndex {
607
771
  dbSizeBytes,
608
772
  };
609
773
  }
610
- /** Close the database connection */
774
+ /** Close the database connection.
775
+ *
776
+ * Always truncates the WAL so the .db-wal sibling can never outgrow the
777
+ * configured cap on shutdown — the failure mode that produced the 84GB
778
+ * WAL on a user machine. VACUUM only runs when storage is dirty, the DB is
779
+ * small, and the operator explicitly opts in via NEXUS_NGRAM_VACUUM_ON_CLOSE=1, since
780
+ * blind VACUUM on multi-GB DBs blocks shutdown indefinitely. */
611
781
  close() {
612
- this.optimizeStorage(this.storageDirty);
782
+ try {
783
+ this.db.exec('PRAGMA wal_checkpoint(TRUNCATE)');
784
+ }
785
+ catch {
786
+ // best effort — never block shutdown on checkpoint failure
787
+ }
788
+ if (process.env.NEXUS_NGRAM_VACUUM_ON_CLOSE === '1') {
789
+ const vacuumMaxBytes = readEnvBytes('NEXUS_NGRAM_VACUUM_MAX_BYTES', 256 * 1024 * 1024);
790
+ const dbBytes = safeStatSize(this.dbPath);
791
+ if (this.storageDirty && dbBytes > 0 && dbBytes <= vacuumMaxBytes) {
792
+ try {
793
+ this.db.exec('VACUUM');
794
+ this.storageDirty = false;
795
+ }
796
+ catch { /* best effort */ }
797
+ }
798
+ }
613
799
  this.db.close();
614
800
  }
615
801
  }
@@ -267,4 +267,9 @@ export declare const MAX_DISCOVERED_FILES = 32;
267
267
  export declare const ABSOLUTE_SECONDARY_MIN = 0.3;
268
268
  export declare const DISCOVERY_EXTENSIONS: Set<string>;
269
269
  export declare const DISCOVERY_IGNORES: Set<string>;
270
+ export declare const DISCOVERY_FILENAME_SKIPS: Set<string>;
271
+ export declare const DISCOVERY_BINARY_EXTENSIONS: Set<string>;
270
272
  export declare const REPO_SEARCH_HEAD_BYTES = 2000;
273
+ /** Default per-file byte cap for the ngram indexer. Override with
274
+ * NEXUS_NGRAM_MAX_FILE_BYTES. Files above this are skipped (not truncated). */
275
+ export declare const DISCOVERY_DEFAULT_MAX_FILE_BYTES = 256000;
@@ -7,6 +7,68 @@
7
7
  export const MAX_AUTONOMY_HISTORY = 24;
8
8
  export const MAX_DISCOVERED_FILES = 32;
9
9
  export const ABSOLUTE_SECONDARY_MIN = 0.3;
10
- export const DISCOVERY_EXTENSIONS = new Set(['.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs', '.json', '.md', '.yml', '.yaml']);
11
- export const DISCOVERY_IGNORES = new Set(['.git', 'node_modules', 'dist', 'coverage', '.next', '.playwright-cli', 'tmp', '.agents', '.agent']);
10
+ // Source-like file extensions worth indexing. Covers JS/TS, Python, Go, Rust,
11
+ // JVM langs, native, web, configs and prose. NEXUS_NGRAM_INDEX_ALL=1 disables
12
+ // the allowlist (useful for diagnostics on non-mainstream repos).
13
+ export const DISCOVERY_EXTENSIONS = new Set([
14
+ '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
15
+ '.py', '.go', '.rs', '.java', '.kt', '.swift',
16
+ '.cpp', '.cc', '.c', '.h', '.hpp', '.cs',
17
+ '.rb', '.php',
18
+ '.md', '.mdx',
19
+ '.json', '.yaml', '.yml', '.toml',
20
+ '.sql', '.sh',
21
+ '.css', '.scss', '.html',
22
+ ]);
23
+ // Directories the indexer must skip — generated artefacts, dependency caches,
24
+ // virtualenvs, build outputs, and Nexus's own working dirs.
25
+ export const DISCOVERY_IGNORES = new Set([
26
+ '.git',
27
+ 'node_modules',
28
+ 'dist',
29
+ 'build',
30
+ 'out',
31
+ 'coverage',
32
+ '.next',
33
+ '.turbo',
34
+ '.playwright-cli',
35
+ '.cache',
36
+ 'vendor',
37
+ 'target',
38
+ '.venv',
39
+ '__pycache__',
40
+ 'tmp',
41
+ '.agents',
42
+ '.agent',
43
+ ]);
44
+ // Specific filenames to skip even when they match an allowed extension.
45
+ // Lockfiles and large generated manifests blow up trigram indexes without
46
+ // any retrieval value.
47
+ export const DISCOVERY_FILENAME_SKIPS = new Set([
48
+ 'package-lock.json',
49
+ 'pnpm-lock.yaml',
50
+ 'yarn.lock',
51
+ 'bun.lockb',
52
+ 'composer.lock',
53
+ 'poetry.lock',
54
+ 'Pipfile.lock',
55
+ 'Cargo.lock',
56
+ 'Gemfile.lock',
57
+ 'go.sum',
58
+ ]);
59
+ // Binary / media / archive extensions never to read into the indexer even
60
+ // when the user passes NEXUS_NGRAM_INDEX_ALL=1 — these contain no useful
61
+ // trigrams and inflate the WAL when accidentally included.
62
+ export const DISCOVERY_BINARY_EXTENSIONS = new Set([
63
+ '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.ico', '.tiff',
64
+ '.mp3', '.mp4', '.mov', '.avi', '.mkv', '.webm', '.wav', '.flac', '.ogg',
65
+ '.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar',
66
+ '.pdf', '.psd', '.ai',
67
+ '.so', '.dll', '.dylib', '.a', '.o', '.node', '.exe', '.bin',
68
+ '.wasm', '.class', '.jar', '.pyc', '.pyo',
69
+ '.lockb',
70
+ ]);
12
71
  export const REPO_SEARCH_HEAD_BYTES = 2_000;
72
+ /** Default per-file byte cap for the ngram indexer. Override with
73
+ * NEXUS_NGRAM_MAX_FILE_BYTES. Files above this are skipped (not truncated). */
74
+ export const DISCOVERY_DEFAULT_MAX_FILE_BYTES = 256_000;
@@ -38,7 +38,7 @@ import { computeFilePriors } from './priors/file-priors.js';
38
38
  import { GhostPass, createSubAgentRuntime, } from '../phantom/index.js';
39
39
  // ─── Types and constants (extracted to orchestrator/types.ts) ─────────────────
40
40
  export * from './orchestrator/types.js';
41
- import { MAX_AUTONOMY_HISTORY, MAX_DISCOVERED_FILES, DISCOVERY_EXTENSIONS, DISCOVERY_IGNORES, } from './orchestrator/types.js';
41
+ import { MAX_AUTONOMY_HISTORY, MAX_DISCOVERED_FILES, DISCOVERY_EXTENSIONS, DISCOVERY_IGNORES, DISCOVERY_FILENAME_SKIPS, DISCOVERY_BINARY_EXTENSIONS, DISCOVERY_DEFAULT_MAX_FILE_BYTES, } from './orchestrator/types.js';
42
42
  // ─── Scoring / planning utilities (extracted to orchestrator/scoring.ts) ───────
43
43
  import { dedupeStrings, shortLabel, extractKeywords, scoreText, scorePath, decomposeTask as decomposeTaskFn, classifyIntent, buildTaskGraph, buildWorkerPlan, decideWorkers, determineMode, toSourceAwareTokenBudget, toRagCandidateStatus, toRagUsageSummary, buildArtifactOutcome, mergeArtifactOutcomeHistory, getArtifactHistoryRecord, getArtifactHistoryScore, deriveArtifactMemoryClass, deriveArtifactFreshness, buildRepoSearchDocument, readRepoSearchHead, extractRepoSearchTerms, } from './orchestrator/scoring.js';
44
44
  import { categoryFilter, emitFunnelStage } from './orchestrator/funnel.js';
@@ -2103,14 +2103,33 @@ export class OrchestratorEngine {
2103
2103
  // Cheap extension check first (no I/O), then bounded-parallel stat for
2104
2104
  // the size + isFile gate. Replaces a per-file synchronous statSync that
2105
2105
  // ran inside .filter() loops on the hot path. CLAUDE.md §2 hard rule #1.
2106
- const candidates = filePaths.filter((f) => DISCOVERY_EXTENSIONS.has(path.extname(f)));
2106
+ //
2107
+ // Indexing discipline: skip lockfiles and binary/media extensions even
2108
+ // when NEXUS_NGRAM_INDEX_ALL=1 — these never produce useful trigrams and
2109
+ // bloat the WAL on big monorepos.
2110
+ const indexAll = process.env.NEXUS_NGRAM_INDEX_ALL === '1';
2111
+ const rawCap = Number(process.env.NEXUS_NGRAM_MAX_FILE_BYTES);
2112
+ const maxFileBytes = Number.isFinite(rawCap) && rawCap > 0
2113
+ ? Math.floor(rawCap)
2114
+ : DISCOVERY_DEFAULT_MAX_FILE_BYTES;
2115
+ const candidates = filePaths.filter((f) => {
2116
+ const base = path.basename(f);
2117
+ if (DISCOVERY_FILENAME_SKIPS.has(base))
2118
+ return false;
2119
+ const ext = path.extname(f).toLowerCase();
2120
+ if (DISCOVERY_BINARY_EXTENSIONS.has(ext))
2121
+ return false;
2122
+ if (indexAll)
2123
+ return true;
2124
+ return DISCOVERY_EXTENSIONS.has(ext);
2125
+ });
2107
2126
  if (candidates.length === 0)
2108
2127
  return [];
2109
2128
  const limit = pLimit(8);
2110
2129
  const results = await Promise.all(candidates.map((f) => limit(async () => {
2111
2130
  try {
2112
2131
  const stat = await fs.promises.stat(f);
2113
- return stat.isFile() && stat.size <= 200_000 ? f : null;
2132
+ return stat.isFile() && stat.size <= maxFileBytes ? f : null;
2114
2133
  }
2115
2134
  catch {
2116
2135
  return null;
@@ -2202,6 +2221,7 @@ export class OrchestratorEngine {
2202
2221
  // we don't need a separate stat per entry, with bounded parallel reads
2203
2222
  // via pLimit(8). Size filtering is done in a single second pass through
2204
2223
  // filterDiscoverableFiles. CLAUDE.md §2 hard rule #1.
2224
+ const walkIndexAll = process.env.NEXUS_NGRAM_INDEX_ALL === '1';
2205
2225
  const limit = pLimit(8);
2206
2226
  const candidates = [];
2207
2227
  let queue = [root];
@@ -2228,7 +2248,12 @@ export class OrchestratorEngine {
2228
2248
  }
2229
2249
  if (!entry.isFile())
2230
2250
  continue;
2231
- if (!DISCOVERY_EXTENSIONS.has(path.extname(entry.name)))
2251
+ if (DISCOVERY_FILENAME_SKIPS.has(entry.name))
2252
+ continue;
2253
+ const ext = path.extname(entry.name).toLowerCase();
2254
+ if (DISCOVERY_BINARY_EXTENSIONS.has(ext))
2255
+ continue;
2256
+ if (!walkIndexAll && !DISCOVERY_EXTENSIONS.has(ext))
2232
2257
  continue;
2233
2258
  candidates.push(fullPath);
2234
2259
  }
@@ -12,6 +12,10 @@ export interface HygieneReport {
12
12
  orphanWorktreesRemoved: number;
13
13
  /** Runs dirs swept due to budgets. */
14
14
  boundedRunsSweeps: number;
15
+ /** Orphan ngram-index sidecar files removed (-wal/-shm without .db). */
16
+ ngramSidecarOrphansRemoved: number;
17
+ /** ngram-index oversize archives pruned beyond keep=1. */
18
+ ngramArchivesPruned: number;
15
19
  }
16
20
  export declare function runStartupHygiene(input: {
17
21
  repoRoot: string;
@@ -6,7 +6,7 @@ import { SessionDNAManager } from './session-dna.js';
6
6
  import { clearBootstrapReceipt, readBootstrapReceipt } from './bootstrap/bootstrap-registry.js';
7
7
  import { doctorGitWorktrees } from './worktree-health.js';
8
8
  import { sweepDirectory, sweepOrphanWorktrees } from '../install/fs-purge.js';
9
- import { getRuntimeTmpRoots, getWorktreeRoots } from '../install/state-locator.js';
9
+ import { enumerateNgramArchives, getRuntimeTmpRoots, getWorktreeRoots } from '../install/state-locator.js';
10
10
  import { resolveWorktreeBudget, resolveRunsBudget } from '../cli/cleanup.js';
11
11
  function isOlderThan(target, maxAgeMs) {
12
12
  try {
@@ -140,6 +140,7 @@ export async function runStartupHygiene(input) {
140
140
  // best-effort
141
141
  }
142
142
  }
143
+ const { ngramSidecarOrphansRemoved, ngramArchivesPruned } = pruneNgramArtifacts(stateDir);
143
144
  return {
144
145
  mode,
145
146
  cleanedRuntimeRegistryEntries: Math.max(0, registryBefore - registryAfter),
@@ -150,5 +151,48 @@ export async function runStartupHygiene(input) {
150
151
  boundedWorktreeSweeps,
151
152
  orphanWorktreesRemoved,
152
153
  boundedRunsSweeps,
154
+ ngramSidecarOrphansRemoved,
155
+ ngramArchivesPruned,
153
156
  };
154
157
  }
158
+ /**
159
+ * Best-effort ngram cleanup at startup:
160
+ * - delete orphan `*.db-wal` / `*.db-shm` when the matching `.db` is missing
161
+ * - keep at most one `ngram-index.db.oversize.*` archive
162
+ * The live DB is left alone — NgramIndex itself enforces WAL bounds via
163
+ * journal_size_limit + autocheckpoint when it opens.
164
+ */
165
+ function pruneNgramArtifacts(stateDir) {
166
+ let ngramSidecarOrphansRemoved = 0;
167
+ let ngramArchivesPruned = 0;
168
+ if (!fs.existsSync(stateDir)) {
169
+ return { ngramSidecarOrphansRemoved, ngramArchivesPruned };
170
+ }
171
+ // Orphan sidecars: foo.db-wal / foo.db-shm with no foo.db.
172
+ try {
173
+ for (const entry of fs.readdirSync(stateDir)) {
174
+ if (!entry.endsWith('.db-wal') && !entry.endsWith('.db-shm'))
175
+ continue;
176
+ const dbBase = entry.replace(/-(?:wal|shm)$/, '');
177
+ if (fs.existsSync(path.join(stateDir, dbBase)))
178
+ continue;
179
+ if (safeUnlink(path.join(stateDir, entry)))
180
+ ngramSidecarOrphansRemoved += 1;
181
+ }
182
+ }
183
+ catch {
184
+ // best-effort
185
+ }
186
+ // Archive cap: keep newest, drop the rest.
187
+ try {
188
+ const archives = enumerateNgramArchives(stateDir).sort((a, b) => b.modifiedAt - a.modifiedAt);
189
+ for (const archive of archives.slice(1)) {
190
+ if (safeUnlink(archive.path))
191
+ ngramArchivesPruned += 1;
192
+ }
193
+ }
194
+ catch {
195
+ // best-effort
196
+ }
197
+ return { ngramSidecarOrphansRemoved, ngramArchivesPruned };
198
+ }
@@ -0,0 +1,33 @@
1
+ /** Marker that identifies a hook entry as Nexus-owned. */
2
+ export declare const NEXUS_HOOK_COMMAND_MARKER = "nexus-prime hook";
3
+ interface HookCommand {
4
+ type: 'command';
5
+ command: string;
6
+ timeout?: number;
7
+ }
8
+ interface HookEntry {
9
+ matcher?: string;
10
+ hooks: HookCommand[];
11
+ }
12
+ /**
13
+ * Canonical Nexus hook spec. Edit here, not in callers.
14
+ * Timeouts protect Claude Code from a wedged Nexus process — leave them set.
15
+ */
16
+ export declare function getNexusHookSpec(): Record<string, HookEntry[]>;
17
+ export interface NexusHookWriteResult {
18
+ /** True when the file would be written (or was written). */
19
+ changed: boolean;
20
+ /** Hook event names that ended up with Nexus entries. */
21
+ events: string[];
22
+ /** Number of stale Nexus entries removed before re-adding. */
23
+ staleRemoved: number;
24
+ }
25
+ /**
26
+ * Idempotent writer for the Nexus hook block. Returns metadata so callers can
27
+ * report status (`installed | unchanged | dry-run`) without re-implementing
28
+ * the merge logic.
29
+ */
30
+ export declare function writeNexusClaudeCodeHooks(settingsPath: string, options?: {
31
+ dryRun?: boolean;
32
+ }): NexusHookWriteResult;
33
+ export {};