@softerist/heuristic-mcp 3.0.15 → 3.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +104 -104
  2. package/config.jsonc +173 -173
  3. package/features/ann-config.js +131 -0
  4. package/features/clear-cache.js +84 -0
  5. package/features/find-similar-code.js +291 -0
  6. package/features/hybrid-search.js +544 -0
  7. package/features/index-codebase.js +3268 -0
  8. package/features/lifecycle.js +1189 -0
  9. package/features/package-version.js +302 -0
  10. package/features/register.js +408 -0
  11. package/features/resources.js +156 -0
  12. package/features/set-workspace.js +265 -0
  13. package/index.js +96 -96
  14. package/lib/cache-ops.js +22 -22
  15. package/lib/cache-utils.js +565 -565
  16. package/lib/cache.js +1870 -1870
  17. package/lib/call-graph.js +396 -396
  18. package/lib/cli.js +1 -1
  19. package/lib/config.js +517 -517
  20. package/lib/constants.js +39 -39
  21. package/lib/embed-query-process.js +7 -7
  22. package/lib/embedding-process.js +7 -7
  23. package/lib/embedding-worker.js +299 -299
  24. package/lib/ignore-patterns.js +316 -316
  25. package/lib/json-worker.js +14 -14
  26. package/lib/json-writer.js +337 -337
  27. package/lib/logging.js +164 -164
  28. package/lib/memory-logger.js +13 -13
  29. package/lib/onnx-backend.js +193 -193
  30. package/lib/project-detector.js +84 -84
  31. package/lib/server-lifecycle.js +165 -165
  32. package/lib/settings-editor.js +754 -754
  33. package/lib/tokenizer.js +256 -256
  34. package/lib/utils.js +428 -428
  35. package/lib/vector-store-binary.js +627 -627
  36. package/lib/vector-store-sqlite.js +95 -95
  37. package/lib/workspace-env.js +28 -28
  38. package/mcp_config.json +9 -9
  39. package/package.json +86 -75
  40. package/scripts/clear-cache.js +20 -0
  41. package/scripts/download-model.js +43 -0
  42. package/scripts/mcp-launcher.js +49 -0
  43. package/scripts/postinstall.js +12 -0
  44. package/search-configs.js +36 -36
  45. package/.prettierrc +0 -7
  46. package/debug-pids.js +0 -30
  47. package/eslint.config.js +0 -36
  48. package/specs/plan.md +0 -23
  49. package/vitest.config.js +0 -39
package/lib/cache.js CHANGED
@@ -1,1870 +1,1870 @@
1
- import fs from 'fs/promises';
2
- import path from 'path';
3
- import { Worker } from 'worker_threads';
4
- import { StreamingJsonWriter } from './json-writer.js';
5
- import { BinaryVectorStore } from './vector-store-binary.js';
6
- import { SqliteVectorStore } from './vector-store-sqlite.js';
7
- import {
8
- JSON_WORKER_THRESHOLD_BYTES,
9
- ANN_DIMENSION_SAMPLE_SIZE,
10
- HNSWLIB_ERROR_RESET_MS,
11
- DEFAULT_READER_WAIT_TIMEOUT_MS,
12
- } from './constants.js';
13
// Cache metadata schema version and on-disk filenames for the embeddings cache.
const CACHE_META_VERSION = 1;
const CACHE_META_FILE = 'meta.json';

// ANN meta version stays at 1 for compatibility; maxElements is optional.
const ANN_META_VERSION = 1;
const ANN_INDEX_FILE = 'ann-index.bin';
const ANN_META_FILE = 'ann-meta.json';

// Persisted call-graph data lives beside the embeddings cache.
const CALL_GRAPH_FILE = 'call-graph.json';

// True when running under Vitest or a generic test runner; gates extra validation paths.
const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';

// Yield to event loop to keep IDE/extension host responsive during heavy CPU loops
const yieldToLoop = () => new Promise((resolve) => setImmediate(resolve));

// Module-level memoization for the optional hnswlib-node dependency:
// hnswlibPromise caches the in-flight/settled dynamic import, hnswlibLoadError
// records the last failure so retries can be throttled (see loadHnswlib).
let hnswlibPromise = null;
let hnswlibLoadError = null;
32
/**
 * Parse a JSON file on a worker thread to keep the main event loop responsive.
 * @param {string} filePath - Path of the JSON file handed to json-worker.js.
 * @returns {Promise<any>} Resolves with the parsed payload; rejects when the
 *   worker reports a failure, errors, or exits without responding.
 */
async function parseJsonInWorker(filePath) {
  return new Promise((resolve, reject) => {
    let done = false;
    const worker = new Worker(new URL('./json-worker.js', import.meta.url), {
      workerData: { filePath },
    });

    // Settle exactly once, then detach all listeners and tear the worker down
    // so no orphaned handlers or worker processes outlive this call.
    const settle = (complete, payload) => {
      if (done) return;
      done = true;
      worker.removeAllListeners();
      const stopping = worker.terminate?.();
      if (stopping && typeof stopping.catch === 'function') stopping.catch(() => null);
      complete(payload);
    };

    worker.once('message', (msg) => {
      if (msg?.ok) {
        settle(resolve, msg.data);
        return;
      }
      const failure = new Error(msg?.error || 'JSON worker failed');
      console.warn(`[Cache] ${failure.message}`);
      settle(reject, failure);
    });

    worker.once('error', (err) => {
      console.error(`[Cache] JSON worker error: ${err.message}`);
      settle(reject, err);
    });

    worker.once('exit', (code) => {
      if (code !== 0) {
        const failure = new Error(`JSON worker exited with code ${code}`);
        console.error(`[Cache] ${failure.message}`);
        settle(reject, failure);
      } else if (!done) {
        // Clean exit without a message still means the caller got no data.
        const failure = new Error('JSON worker exited without sending a response');
        console.error(`[Cache] ${failure.message}`);
        settle(reject, failure);
      }
    });
  });
}
81
-
82
/**
 * Read and parse a JSON file, offloading large files to a worker thread.
 * Both a missing file and a parse failure yield null (non-fatal conditions).
 * @param {string} filePath
 * @param {{workerThresholdBytes?: number}} [options] - Size at or above which
 *   parsing is delegated to parseJsonInWorker.
 * @returns {Promise<any|null>}
 */
async function readJsonFile(
  filePath,
  { workerThresholdBytes = JSON_WORKER_THRESHOLD_BYTES } = {}
) {
  let stats;
  try {
    stats = await fs.stat(filePath);
  } catch {
    // Missing file is an expected condition, not an error.
    return null;
  }

  try {
    const workerAvailable = typeof Worker === 'function';
    const shouldOffload =
      workerAvailable &&
      stats &&
      typeof stats.size === 'number' &&
      stats.size >= workerThresholdBytes;

    if (shouldOffload) return await parseJsonInWorker(filePath);

    const raw = await fs.readFile(filePath, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    console.warn(`[Cache] Failed to parse ${path.basename(filePath)}: ${error.message}`);
    return null;
  }
}
110
-
111
/**
 * Lazily import hnswlib-node, memoizing both success and failure at module
 * scope. After a load failure, retries are suppressed until
 * HNSWLIB_ERROR_RESET_MS has elapsed; then the cached error and promise are
 * cleared so the import is attempted again.
 * @returns {Promise<Function|null>|null} Resolves to the HierarchicalNSW
 *   constructor, or null when the module is unavailable.
 */
async function loadHnswlib() {
  // Reset error state after configured timeout to allow retry
  if (hnswlibLoadError) {
    if (hnswlibLoadError._timestamp && Date.now() - hnswlibLoadError._timestamp > HNSWLIB_ERROR_RESET_MS) {
      hnswlibLoadError = null;
      hnswlibPromise = null;
    } else {
      // Still inside the back-off window: report unavailable without re-importing.
      return null;
    }
  }

  if (!hnswlibPromise) {
    hnswlibPromise = import('hnswlib-node')
      .then((mod) => {
        // Handle both CJS and ESM interop shapes of the module.
        const HierarchicalNSW = mod?.HierarchicalNSW || mod?.default?.HierarchicalNSW;
        if (!HierarchicalNSW) throw new Error('HierarchicalNSW export not found');
        return HierarchicalNSW;
      })
      .catch((err) => {
        // Store timestamp to allow later retry
        err._timestamp = Date.now();
        hnswlibLoadError = err;
        console.warn(`[ANN] hnswlib-node unavailable, using linear search (${err.message})`);
        return null;
      });
  }

  return hnswlibPromise;
}
140
-
141
/**
 * Initialize an HNSW index, falling back across hnswlib-node API variants:
 * 4-argument form first, then the 3-argument legacy form, finally
 * capacity-only. Each failed attempt is logged as a warning.
 * @param {object} index - HierarchicalNSW-like instance exposing initIndex.
 * @param {number} maxElements - Index capacity.
 * @param {number} m - HNSW M parameter.
 * @param {number} efConstruction - HNSW efConstruction parameter.
 */
function initHnswIndex(index, maxElements, m, efConstruction) {
  const attempts = [
    { label: 'Standard', args: [maxElements, m, efConstruction, 100] },
    { label: 'Legacy', args: [maxElements, m, efConstruction] },
  ];
  for (const { label, args } of attempts) {
    try {
      index.initIndex(...args);
      return;
    } catch (err) {
      console.warn(`[ANN] ${label} init failed: ${err.message}`);
    }
  }
  // Last resort: capacity-only initialization (errors propagate to the caller).
  index.initIndex(maxElements);
}
156
-
157
/**
 * Load a persisted HNSW index from disk, trying the 2-argument API first and
 * falling back to the 1-argument variant for older hnswlib-node builds.
 * @param {object} index - HierarchicalNSW-like instance exposing readIndexSync.
 * @param {string} filePath - Path of the serialized index file.
 * @param {number} maxElements - Capacity hint for the 2-argument API.
 * @returns {boolean} true when the index was read successfully.
 */
function readHnswIndex(index, filePath, maxElements) {
  try {
    index.readIndexSync(filePath, maxElements);
    return true;
  } catch {
    // Fall through to the single-argument API below.
  }
  try {
    index.readIndexSync(filePath);
    return true;
  } catch (err) {
    console.warn(`[ANN] Read index failed: ${err.message}`);
    return false;
  }
}
172
-
173
/**
 * Normalize an ANN search result into a plain array of labels.
 * Accepts either a bare array or an object exposing labels/neighbors/indices
 * (typed arrays included); anything else yields an empty array.
 * @param {Array|object|null|undefined} result
 * @returns {Array}
 */
function normalizeLabels(result) {
  if (Array.isArray(result)) return result;
  if (!result) return [];
  const found = result.labels || result.neighbors || result.indices;
  return found ? Array.from(found) : [];
}
179
-
180
/**
 * Coerce a vector-like value into a Float32Array by converting values
 * (never reinterpreting the underlying bytes). Float32Array inputs are
 * returned as-is; falsy inputs yield null. Under test runs, every element
 * is additionally checked to be finite so corruption surfaces early.
 * @param {ArrayLike<number>|null|undefined} vector
 * @returns {Float32Array|null}
 */
function ensureFloat32(vector) {
  if (!vector) return null;
  if (vector instanceof Float32Array) return vector;

  // Value-wise conversion (do NOT reinterpret the buffer's bytes).
  const converted = ArrayBuffer.isView(vector)
    ? Float32Array.from(vector)
    : new Float32Array(vector);

  // Test-only validation: reject NaN/Infinity entries.
  if (IS_TEST_ENV && converted.length > 0) {
    converted.forEach((value, i) => {
      if (!Number.isFinite(value)) {
        throw new Error(
          `Invalid vector value at index ${i}: ${value}. ` +
            'Vector contains NaN or Infinity, which will corrupt search results.'
        );
      }
    });
  }

  return converted;
}
206
-
207
// Replace a chunk's vector (when present) with its Float32Array form, in place.
function normalizeChunkVector(chunk) {
  if (!chunk?.vector) return;
  chunk.vector = ensureFloat32(chunk.vector);
}
210
-
211
/**
 * Stamp each chunk with its position in the store via the _index property.
 * Non-array inputs are a no-op; null/undefined slots are skipped.
 * @param {Array<object|null>} store
 */
function assignChunkIndices(store) {
  if (!Array.isArray(store)) return;
  store.forEach((chunk, position) => {
    if (chunk) chunk._index = position;
  });
}
220
-
221
/**
 * Validate and normalize a persisted file-hash entry.
 * Accepts a bare hash string or an object of shape {hash, mtimeMs?, size?};
 * numeric metadata fields are copied only when finite.
 * @param {string|object|null|undefined} entry
 * @returns {{hash: string, mtimeMs?: number, size?: number}|null} null when invalid.
 */
function normalizeFileHashEntry(entry) {
  if (!entry) return null;
  if (typeof entry === 'string') return { hash: entry };
  if (typeof entry !== 'object' || typeof entry.hash !== 'string') return null;
  const result = { hash: entry.hash };
  if (Number.isFinite(entry.mtimeMs)) result.mtimeMs = entry.mtimeMs;
  if (Number.isFinite(entry.size)) result.size = entry.size;
  return result;
}
231
-
232
/**
 * Serialize a file-hash entry for persistence.
 * The on-disk shape is identical to the normalized in-memory shape
 * ({hash, mtimeMs?, size?}), so this delegates to normalizeFileHashEntry
 * instead of duplicating its validation logic verbatim (the previous body
 * was a line-for-line copy of that function).
 * @param {string|object|null|undefined} entry
 * @returns {{hash: string, mtimeMs?: number, size?: number}|null}
 */
function serializeFileHashEntry(entry) {
  return normalizeFileHashEntry(entry);
}
242
-
243
/**
 * Compute ANN index capacity with headroom beyond the current chunk count.
 * Takes the larger of a multiplicative growth factor (default 1.2) and a
 * fixed extra allowance (default 1024), never dropping below total.
 * @param {number} total - Current number of chunks.
 * @param {object} config - May carry annCapacityFactor / annCapacityExtra.
 * @returns {number}
 */
function computeAnnCapacity(total, config) {
  const growthFactor =
    typeof config.annCapacityFactor === 'number' ? config.annCapacityFactor : 1.2;
  const headroom = Number.isInteger(config.annCapacityExtra) ? config.annCapacityExtra : 1024;
  return Math.max(total, Math.ceil(total * growthFactor), total + headroom);
}
250
-
251
- export class EmbeddingsCache {
252
  /**
   * @param {object} config - Resolved server configuration (cache directory,
   *   embedding model/dimension, vector-store format, tuning knobs).
   *   Stored by reference; not copied.
   */
  constructor(config) {
    this.config = config;

    // Primary in-memory chunk store plus per-file hash bookkeeping.
    this.vectorStore = [];
    this.fileHashes = new Map();
    this.isSaving = false;
    this.lastIndexDurationMs = null;
    this.lastIndexStats = null;

    // Metadata persisted alongside the cache; used on load to detect
    // version/model/dimension drift and invalidate stale caches.
    this.cacheMeta = {
      version: CACHE_META_VERSION,
      embeddingModel: config.embeddingModel,
      embeddingDimension: config.embeddingDimension ?? null,
    };

    // Save coalescing / debounce (serialized via saveQueue)
    this.saveQueue = Promise.resolve();
    this._saveTimer = null;
    this._saveRequested = false;
    this._savePromise = null;

    // ANN state
    this.annIndex = null;
    this.annMeta = null;
    this.annDirty = false; // needs rebuild
    this.annPersistDirty = false; // in-memory differs from disk
    this.annLoading = null;
    this.annVectorCache = null;

    // Call graph
    this.fileCallData = new Map();
    this.callGraph = null;
    this._callGraphBuild = null;

    // Binary vector store (optional)
    this.binaryStore = null;

    // SQLite vector store (optional)
    this.sqliteStore = null;

    // Error tracking
    this.initErrors = [];

    // Concurrency hooks (read tracking)
    this.activeReads = 0;
    this._readWaiters = [];
    this._saveInProgress = false; // Prevents new reads during save

    // Lazy reload support after dropping in-memory vectors
    this._clearedAfterIndex = false;
    this._loadPromise = null;
  }
304
-
305
- /**
306
- * Add an initialization error with consistent structure.
307
- * @param {string} stage - The stage where the error occurred (e.g., 'loadHnswlib', 'ensureAnnIndex')
308
- * @param {Error|string} error - The error object or message
309
- */
310
- addInitError(stage, error) {
311
- this.initErrors.push({
312
- stage,
313
- message: error instanceof Error ? error.message : String(error),
314
- stack: error instanceof Error ? error.stack : null,
315
- timestamp: Date.now(),
316
- });
317
- }
318
-
319
- clearInMemoryState() {
320
- this.vectorStore = [];
321
- this.fileHashes.clear();
322
- this.invalidateAnnIndex();
323
- this.fileCallData.clear();
324
- this.callGraph = null;
325
- this.initErrors = [];
326
- if (this.binaryStore) {
327
- try {
328
- this.binaryStore.close?.();
329
- } catch {
330
- // ignore close errors
331
- }
332
- this.binaryStore = null;
333
- }
334
- if (this.sqliteStore) {
335
- try {
336
- this.sqliteStore.close?.();
337
- } catch {
338
- // ignore close errors
339
- }
340
- this.sqliteStore = null;
341
- }
342
- }
343
-
344
  /**
   * Release persistent store handles before shutdown.
   * The binary store close is awaited and its errors propagate (the handle is
   * only cleared on success); the SQLite close is best-effort because the
   * handle may already be closed or in an error state.
   */
  async close() {
    if (this.binaryStore) {
      await this.binaryStore.close();
      this.binaryStore = null;
    }
    if (this.sqliteStore) {
      try {
        this.sqliteStore.close();
      } catch {
        // SQLite may already be closed or in error state
      }
      this.sqliteStore = null;
    }
  }
358
-
359
- async ensureLoaded({ preferDisk = false } = {}) {
360
- if (!this.config.enableCache) return;
361
- if (!this._clearedAfterIndex) return;
362
- if (this._loadPromise) return this._loadPromise;
363
-
364
- this._loadPromise = (async () => {
365
- if (preferDisk && this.config.verbose) {
366
- console.info('[Cache] ensureLoaded: forcing disk vector mode for incremental low-RAM reload');
367
- }
368
- await this.load({
369
- forceVectorLoadMode: preferDisk ? 'disk' : undefined,
370
- });
371
- this._clearedAfterIndex = false;
372
- })().finally(() => {
373
- this._loadPromise = null;
374
- });
375
-
376
- return this._loadPromise;
377
- }
378
-
379
- async dropInMemoryVectors() {
380
- if (!this.config.enableCache) return;
381
-
382
- if (this.activeReads > 0) {
383
- await this.waitForReaders();
384
- }
385
-
386
- this.vectorStore = [];
387
- this.annVectorCache = null;
388
- this.annIndex = null;
389
- this.annMeta = null;
390
- this.annDirty = true;
391
- this.annPersistDirty = false;
392
-
393
- if (this.binaryStore) {
394
- try {
395
- await this.binaryStore.close();
396
- } catch {
397
- // ignore close errors
398
- }
399
- this.binaryStore = null;
400
- }
401
-
402
- if (this.sqliteStore) {
403
- try {
404
- this.sqliteStore.close();
405
- } catch {
406
- // ignore close errors
407
- }
408
- this.sqliteStore = null;
409
- }
410
-
411
- this._clearedAfterIndex = true;
412
- }
413
-
414
- // -------------------- Concurrency Hooks --------------------
415
-
416
- startRead() {
417
- // Prevent new reads while save is in progress to avoid race conditions
418
- if (this._saveInProgress) {
419
- throw new Error('Cache save in progress, try again shortly');
420
- }
421
- this.activeReads++;
422
- }
423
-
424
- endRead() {
425
- if (this.activeReads > 0) {
426
- this.activeReads--;
427
- if (this.activeReads === 0 && this._readWaiters.length > 0) {
428
- const waiters = this._readWaiters;
429
- this._readWaiters = [];
430
- for (const resolve of waiters) {
431
- resolve();
432
- }
433
- }
434
- }
435
- }
436
-
437
- async waitForReaders() {
438
- if (this.activeReads === 0) return;
439
- await new Promise((resolve) => {
440
- this._readWaiters.push(resolve);
441
- });
442
- }
443
-
444
- async waitForReadersWithTimeout(timeoutMs = 5000) {
445
- if (this.activeReads === 0) return true;
446
- let timedOut = false;
447
- let resolved = false;
448
- let waiterResolve;
449
- const waiterPromise = new Promise((resolve) => {
450
- waiterResolve = () => {
451
- if (!resolved) {
452
- resolved = true;
453
- resolve();
454
- }
455
- };
456
- this._readWaiters.push(waiterResolve);
457
- });
458
- await Promise.race([
459
- waiterPromise,
460
- new Promise((resolve) => {
461
- setTimeout(() => {
462
- if (!resolved) {
463
- resolved = true;
464
- timedOut = true;
465
- // Remove waiter from array to prevent late invocation after timeout
466
- const idx = this._readWaiters.indexOf(waiterResolve);
467
- if (idx >= 0) this._readWaiters.splice(idx, 1);
468
- resolve();
469
- }
470
- }, timeoutMs);
471
- }),
472
- ]);
473
- if (timedOut) {
474
- // Always warn (not just verbose) since proceeding with active readers is risky
475
- console.warn(
476
- `[Cache] Timed out waiting for ${this.activeReads} active reader(s); proceeding with save anyway. ` +
477
- 'This may cause data inconsistency if readers access the store during write.'
478
- );
479
- }
480
- return !timedOut;
481
- }
482
-
483
- // -------------------- Reset --------------------
484
-
485
- /**
486
- * Resets the cache state (clears vectors, hashes, and call graph).
487
- * Used for forced reindexing.
488
- */
489
- async reset() {
490
- this.vectorStore = [];
491
- if (this.binaryStore) {
492
- try {
493
- await this.binaryStore.close();
494
- } catch {
495
- // ignore close errors
496
- }
497
- this.binaryStore = null;
498
- }
499
- if (this.sqliteStore) {
500
- try {
501
- this.sqliteStore.close();
502
- } catch {
503
- // ignore close errors
504
- }
505
- this.sqliteStore = null;
506
- }
507
- this.fileHashes.clear();
508
- this.invalidateAnnIndex();
509
- await this.clearCallGraphData({ removeFile: true });
510
- this.initErrors = [];
511
- }
512
-
513
- // -------------------- Load --------------------
514
-
515
  /**
   * Load cached embeddings, file hashes, and call-graph data from the cache
   * directory. meta.json is validated first (schema version, embedding model,
   * embedding dimension) so large payloads are never parsed for a stale cache.
   * Supports the json, binary, and sqlite vector-store formats; any failure
   * falls back to clearInMemoryState().
   * @param {{forceVectorLoadMode?: 'disk'|'memory'}} [options] - Override the
   *   configured vector load mode (used by ensureLoaded's low-RAM path).
   */
  async load({ forceVectorLoadMode } = {}) {
    if (!this.config.enableCache) return;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });

      const cacheFile = path.join(this.config.cacheDirectory, 'embeddings.json');
      const hashFile = path.join(this.config.cacheDirectory, 'file-hashes.json');
      const metaFile = path.join(this.config.cacheDirectory, CACHE_META_FILE);

      // Threshold above which JSON parsing is offloaded to a worker thread.
      const workerThresholdBytes =
        Number.isInteger(this.config.jsonWorkerThresholdBytes) &&
        this.config.jsonWorkerThresholdBytes > 0
          ? this.config.jsonWorkerThresholdBytes
          : JSON_WORKER_THRESHOLD_BYTES;

      const useBinary = this.config.vectorStoreFormat === 'binary';
      const useSqlite = this.config.vectorStoreFormat === 'sqlite';

      const { vectorsPath, recordsPath, contentPath, filesPath } = BinaryVectorStore.getPaths(
        this.config.cacheDirectory
      );
      const pathExists = async (targetPath) => {
        try {
          await fs.access(targetPath);
          return true;
        } catch {
          return false;
        }
      };

      // In tests, read cache files eagerly to exercise worker paths.
      let cacheData = null;
      let hashData = null;
      let prefetched = false;
      if (IS_TEST_ENV) {
        prefetched = true;
        const cachePromise = useBinary || useSqlite
          ? Promise.resolve(null)
          : readJsonFile(cacheFile, { workerThresholdBytes });
        [cacheData, hashData] = await Promise.all([
          cachePromise,
          readJsonFile(hashFile, { workerThresholdBytes }),
        ]);
      }

      // Read meta first to avoid parsing huge cache files when invalid
      const metaData = await fs.readFile(metaFile, 'utf-8').catch(() => null);
      if (!metaData) {
        console.warn('[Cache] Missing cache metadata, ignoring cache');
        this.clearInMemoryState();
        return;
      }

      let meta;
      try {
        meta = JSON.parse(metaData);
      } catch {
        console.warn('[Cache] Invalid cache metadata, ignoring cache');
        this.clearInMemoryState();
        return;
      }

      // Schema-version gate: any mismatch invalidates the whole cache.
      if (meta?.version !== CACHE_META_VERSION) {
        console.warn(`[Cache] Cache version mismatch (${meta?.version}), ignoring cache`);
        this.clearInMemoryState();
        return;
      }

      // Model gate: embeddings from a different model are unusable.
      if (meta?.embeddingModel !== this.config.embeddingModel) {
        console.warn(
          `[Cache] Embedding model changed, ignoring cache (${meta?.embeddingModel} -> ${this.config.embeddingModel})`
        );
        this.clearInMemoryState();
        return;
      }
      // Dimension gate: both sides normalized to null when unset.
      const expectedDimension = this.config.embeddingDimension ?? null;
      const metaDimension = meta?.embeddingDimension ?? null;
      if (metaDimension !== expectedDimension) {
        console.warn(
          `[Cache] Embedding dimension changed, ignoring cache (${metaDimension} -> ${expectedDimension})`
        );
        this.clearInMemoryState();
        return;
      }

      // Outside tests, the payload is only read once meta validation passed.
      if (!prefetched) {
        [cacheData, hashData] = await Promise.all([
          useBinary || useSqlite ? Promise.resolve(null) : readJsonFile(cacheFile, { workerThresholdBytes }),
          readJsonFile(hashFile, { workerThresholdBytes }),
        ]);
      }

      this.cacheMeta = meta;

      // Probe which on-disk artifacts actually exist so format/file mismatches
      // can be reported with an actionable warning.
      const [binaryFilesPresent, jsonCachePresent] = await Promise.all([
        (async () => {
          const [vectorsOk, recordsOk, contentOk, filesOk] = await Promise.all([
            pathExists(vectorsPath),
            pathExists(recordsPath),
            pathExists(contentPath),
            pathExists(filesPath),
          ]);
          return vectorsOk && recordsOk && contentOk && filesOk;
        })(),
        pathExists(cacheFile),
      ]);

      if (useBinary && !binaryFilesPresent) {
        if (jsonCachePresent) {
          console.warn(
            '[Cache] vectorStoreFormat=binary but binary cache files are missing; embeddings.json exists. If you switched formats, reindex or set vectorStoreFormat=json.'
          );
        } else {
          console.warn(
            '[Cache] vectorStoreFormat=binary but binary cache files are missing. Reindex to regenerate the cache.'
          );
        }
      } else if (!useBinary && !useSqlite && !jsonCachePresent) {
        if (binaryFilesPresent) {
          console.warn(
            '[Cache] vectorStoreFormat=json but binary cache files exist. If you switched formats, set vectorStoreFormat=binary or reindex.'
          );
        } else {
          console.warn(
            '[Cache] vectorStoreFormat=json but embeddings.json is missing. Reindex to regenerate the cache.'
          );
        }
      }

      // forceVectorLoadMode (when valid) takes precedence over configuration.
      const configuredVectorLoadMode =
        typeof this.config.vectorStoreLoadMode === 'string'
          ? this.config.vectorStoreLoadMode.toLowerCase()
          : 'memory';
      const effectiveVectorLoadMode =
        forceVectorLoadMode === 'disk' || forceVectorLoadMode === 'memory'
          ? forceVectorLoadMode
          : configuredVectorLoadMode;

      if (useBinary) {
        try {
          this.binaryStore = await BinaryVectorStore.load(this.config.cacheDirectory, {
            contentCacheEntries: this.config.contentCacheEntries,
            vectorCacheEntries: this.config.vectorCacheEntries,
            vectorLoadMode: effectiveVectorLoadMode,
          });
          // Vectors stay on disk (not inlined) when load mode is 'disk'.
          cacheData = await this.binaryStore.toChunkViews({
            includeContent: this.config.vectorStoreContentMode === 'inline',
            includeVector: effectiveVectorLoadMode !== 'disk',
          });
        } catch (err) {
          this.binaryStore = null;
          console.warn(`[Cache] Failed to load binary vector store: ${err.message}`);
        }
      }

      // SQLite store loading
      if (useSqlite) {
        try {
          this.sqliteStore = await SqliteVectorStore.load(this.config.cacheDirectory);
          if (this.sqliteStore) {
            cacheData = this.sqliteStore.toChunkViews({
              includeContent: this.config.vectorStoreContentMode === 'inline',
              includeVector: effectiveVectorLoadMode !== 'disk',
            });
          } else {
            // SQLite file missing, need reindex
            console.warn('[Cache] vectorStoreFormat=sqlite but vectors.sqlite is missing. Reindex to regenerate the cache.');
          }
        } catch (err) {
          this.sqliteStore = null;
          console.warn(`[Cache] Failed to load SQLite vector store: ${err.message}`);
        }
      }

      // Last-resort fallback to the JSON payload when no store produced data.
      if (!cacheData) {
        cacheData = await readJsonFile(cacheFile, { workerThresholdBytes });
      }

      const hasCacheData = Array.isArray(cacheData);
      const hasHashData = hashData && typeof hashData === 'object';

      if (hasCacheData) {
        // Chunks for files no longer matching the configured extensions or
        // file names are dropped (JSON format only; binary store is trusted).
        const allowedExtensions = new Set(
          (this.config.fileExtensions || []).map((ext) => `.${ext}`)
        );
        const allowedFileNames = new Set(this.config.fileNames || []);
        const applyExtensionFilter = !this.binaryStore;
        const shouldKeepFile = (filePath) => {
          const ext = path.extname(filePath);
          if (allowedExtensions.has(ext)) return true;
          return allowedFileNames.has(path.basename(filePath));
        };

        const rawHashes = hasHashData ? new Map(Object.entries(hashData)) : new Map();
        this.vectorStore = [];
        this.fileHashes.clear();

        // Single-pass filter + normalization
        for (const chunk of cacheData) {
          if (applyExtensionFilter) {
            if (!shouldKeepFile(chunk.file)) continue;
          }
          normalizeChunkVector(chunk);
          this.vectorStore.push(chunk);
        }
        const filteredCount = cacheData.length - this.vectorStore.length;
        if (filteredCount > 0 && this.config.verbose) {
          console.info(`[Cache] Filtered ${filteredCount} outdated cache entries`);
        }

        if (hasHashData) {
          // Only keep hashes for allowed extensions
          for (const [file, entry] of rawHashes) {
            if (!applyExtensionFilter || shouldKeepFile(file)) {
              const normalized = normalizeFileHashEntry(entry);
              if (normalized) {
                this.fileHashes.set(file, normalized);
              }
            }
          }
        } else {
          console.warn(
            '[Cache] Missing file-hashes.json; loaded embeddings but hashes were cleared'
          );
        }

        assignChunkIndices(this.vectorStore);

        if (this.config.verbose) {
          console.info(`[Cache] Loaded ${this.vectorStore.length} cached embeddings`);
        }

        // ANN index is lazily loaded/built on first query
        this.annDirty = false;
        this.annPersistDirty = false;
        this.annIndex = null;
        this.annMeta = null;
        this.annVectorCache = null;
      } else if (cacheData) {
        console.warn('[Cache] Cache data is not an array; ignoring cached embeddings');
      } else if (hasHashData) {
        console.warn('[Cache] Hashes exist without embeddings; ignoring file-hashes.json');
      }

      // Load call-graph data if it exists
      const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
      try {
        const callGraphData = await fs.readFile(callGraphFile, 'utf8');
        const parsed = JSON.parse(callGraphData);
        this.fileCallData = new Map(Object.entries(parsed));
        if (this.config.verbose) {
          console.info(`[Cache] Loaded call-graph data for ${this.fileCallData.size} files`);
        }
      } catch {
        // no cache yet, OK
      }
    } catch (error) {
      console.warn('[Cache] Failed to load cache:', error.message);
      this.clearInMemoryState();
    }
  }
777
-
778
- // -------------------- Save (debounced + serialized) --------------------
779
-
780
  /**
   * Request a cache save. Requests are debounced (config.saveDebounceMs,
   * default 250ms) and coalesced: calls arriving while a timer is pending
   * share the in-flight promise, and saveQueue serializes actual writes so
   * they never overlap. The while loop re-runs performSave() when new
   * requests arrived during a save.
   * @returns {Promise<void>} Resolves when the coalesced save cycle completes.
   */
  save() {
    if (!this.config.enableCache) return Promise.resolve();

    this._saveRequested = true;

    // A debounce timer is already pending: piggyback on its promise.
    if (this._saveTimer) return this._savePromise ?? Promise.resolve();

    const debounceMs = Number.isInteger(this.config.saveDebounceMs)
      ? this.config.saveDebounceMs
      : 250;

    this._savePromise = new Promise((resolve, reject) => {
      this._saveTimer = setTimeout(() => {
        this._saveTimer = null;

        // Chain onto saveQueue so successive save cycles run strictly in order.
        this.saveQueue = this.saveQueue
          .then(async () => {
            while (this._saveRequested) {
              this._saveRequested = false;
              await this.performSave();
            }
          })
          .then(resolve, reject)
          .finally(() => {
            this._savePromise = null;
          });
      }, debounceMs);
    });

    return this._savePromise;
  }
811
-
812
- async performSave() {
813
- // Block new reads from starting during save operation
814
- this._saveInProgress = true;
815
-
816
- // Wait for active readers before modifying state to prevent data corruption
817
- if (this.activeReads > 0) {
818
- const timeoutMs = this.config.saveReaderWaitTimeoutMs ?? DEFAULT_READER_WAIT_TIMEOUT_MS;
819
- const allReadersFinished = await this.waitForReadersWithTimeout(timeoutMs);
820
- if (!allReadersFinished && !this.config.forceSaveWithActiveReaders) {
821
- console.warn('[Cache] Aborting save - active readers still present after timeout');
822
- this._saveInProgress = false; // Reset flag on early return
823
- return; // Abort instead of risking data corruption
824
- }
825
- }
826
-
827
- this.isSaving = true;
828
-
829
- try {
830
- await fs.mkdir(this.config.cacheDirectory, { recursive: true });
831
-
832
- const cacheFile = path.join(this.config.cacheDirectory, 'embeddings.json');
833
- const hashFile = path.join(this.config.cacheDirectory, 'file-hashes.json');
834
- const metaFile = path.join(this.config.cacheDirectory, CACHE_META_FILE);
835
-
836
- // Snapshot to avoid race conditions during async write.
837
- // Keep this shallow for binary/sqlite to prevent multi-GB vector materialization.
838
- const snapshotStore = Array.isArray(this.vectorStore) ? [...this.vectorStore] : [];
839
- const supportsBackendVectorResolve =
840
- this.config.vectorStoreFormat === 'binary' || this.config.vectorStoreFormat === 'sqlite';
841
- const hasMissingVectors = snapshotStore.some(
842
- (chunk) => chunk && (chunk.vector === undefined || chunk.vector === null)
843
- );
844
- const useDiskVectors =
845
- supportsBackendVectorResolve &&
846
- (this.config.vectorStoreLoadMode === 'disk' || hasMissingVectors);
847
- if (hasMissingVectors && !useDiskVectors) {
848
- throw new Error(
849
- 'Missing vector data for cache write and backend vector resolution is unavailable'
850
- );
851
- }
852
-
853
- this.cacheMeta = {
854
- version: CACHE_META_VERSION,
855
- embeddingModel: this.config.embeddingModel,
856
- embeddingDimension: this.config.embeddingDimension ?? null,
857
- lastSaveTime: new Date().toISOString(),
858
- filesIndexed: this.fileHashes.size,
859
- chunksStored: snapshotStore.length,
860
- workspace: this.config.searchDirectory || null,
861
- };
862
- if (Number.isFinite(this.lastIndexDurationMs) && this.lastIndexDurationMs >= 0) {
863
- this.cacheMeta.indexDurationMs = Math.round(this.lastIndexDurationMs);
864
- }
865
- if (this.lastIndexStats && typeof this.lastIndexStats === 'object') {
866
- Object.assign(this.cacheMeta, this.lastIndexStats);
867
- }
868
-
869
- const total = snapshotStore.length;
870
- if (this.config.vectorStoreFormat === 'binary') {
871
- this.binaryStore = await BinaryVectorStore.write(
872
- this.config.cacheDirectory,
873
- snapshotStore,
874
- {
875
- contentCacheEntries: this.config.contentCacheEntries,
876
- vectorCacheEntries: this.config.vectorCacheEntries,
877
- vectorLoadMode: useDiskVectors ? 'disk' : this.config.vectorStoreLoadMode,
878
- getContent: (chunk, index) => this.getChunkContent(chunk, index),
879
- getVector: useDiskVectors ? (chunk, index) => this.getChunkVector(chunk, index) : null,
880
- preRename: async () => {
881
- if (this.activeReads > 0) {
882
- await this.waitForReadersWithTimeout(
883
- Number.isInteger(this.config.saveReaderWaitTimeoutMs)
884
- ? this.config.saveReaderWaitTimeoutMs
885
- : 5000
886
- );
887
- }
888
- if (this.binaryStore) {
889
- await this.binaryStore.close();
890
- this.binaryStore = null;
891
- }
892
- },
893
- }
894
- );
895
- if (this.binaryStore) {
896
- this.cacheMeta.chunksStored = this.binaryStore.length;
897
- }
898
- } else if (this.config.vectorStoreFormat === 'sqlite') {
899
- // SQLite store save
900
- if (this.sqliteStore) {
901
- try {
902
- this.sqliteStore.close();
903
- } catch {
904
- // ignore close errors
905
- }
906
- this.sqliteStore = null;
907
- }
908
- this.sqliteStore = await SqliteVectorStore.write(
909
- this.config.cacheDirectory,
910
- snapshotStore,
911
- {
912
- getContent: (chunk, index) => this.getChunkContent(chunk, index),
913
- getVector: useDiskVectors ? (chunk, index) => this.getChunkVector(chunk, index) : null,
914
- preRename: async () => {
915
- if (this.activeReads > 0) {
916
- await this.waitForReadersWithTimeout(
917
- Number.isInteger(this.config.saveReaderWaitTimeoutMs)
918
- ? this.config.saveReaderWaitTimeoutMs
919
- : 5000
920
- );
921
- }
922
- },
923
- }
924
- );
925
- if (this.sqliteStore) {
926
- this.cacheMeta.chunksStored = this.sqliteStore.length();
927
- }
928
- } else {
929
- const vectorWriter = new StreamingJsonWriter(cacheFile, {
930
- highWaterMark: this.config.cacheWriteHighWaterMark ?? 256 * 1024,
931
- floatDigits: this.config.cacheVectorFloatDigits ?? 6,
932
- flushChars: this.config.cacheVectorFlushChars ?? 256 * 1024,
933
- indent: '', // set to " " if you prefer pretty formatting
934
- assumeFinite: this.config.cacheVectorAssumeFinite,
935
- checkFinite: this.config.cacheVectorCheckFinite,
936
- noMutation: this.config.cacheVectorNoMutation ?? false,
937
- joinThreshold: this.config.cacheVectorJoinThreshold ?? 8192,
938
- joinChunkSize: this.config.cacheVectorJoinChunkSize ?? 2048,
939
- });
940
-
941
- await vectorWriter.writeStart();
942
-
943
- // Optional responsiveness yield (only for huge saves)
944
- const yieldEvery = total >= 50_000 ? 5000 : 0;
945
-
946
- try {
947
- for (let i = 0; i < total; i++) {
948
- const pending = vectorWriter.writeItem(snapshotStore[i]);
949
- if (pending) await pending;
950
- if (yieldEvery && i > 0 && i % yieldEvery === 0) await yieldToLoop();
951
- }
952
- await vectorWriter.writeEnd();
953
- } catch (e) {
954
- vectorWriter.abort(e);
955
- throw e;
956
- }
957
- }
958
-
959
- const hashEntries = {};
960
- for (const [file, entry] of this.fileHashes) {
961
- const serialized = serializeFileHashEntry(entry);
962
- if (serialized) {
963
- hashEntries[file] = serialized;
964
- }
965
- }
966
-
967
- await Promise.all([
968
- fs.writeFile(hashFile, JSON.stringify(hashEntries, null, 2)),
969
- fs.writeFile(metaFile, JSON.stringify(this.cacheMeta, null, 2)),
970
- ]);
971
-
972
- // Save call-graph data (or remove stale cache)
973
- const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
974
- if (this.fileCallData.size > 0) {
975
- await fs.writeFile(
976
- callGraphFile,
977
- JSON.stringify(Object.fromEntries(this.fileCallData), null, 2)
978
- );
979
- } else {
980
- await fs.rm(callGraphFile, { force: true });
981
- }
982
-
983
- // Persist ANN index if it exists and changed in memory
984
- // Use mutex to prevent concurrent writes (index could be modified during save)
985
- if (
986
- this.config.annIndexCache !== false &&
987
- this.annPersistDirty &&
988
- !this.annDirty &&
989
- !this._annWriting &&
990
- this.annIndex &&
991
- this.annMeta
992
- ) {
993
- this._annWriting = true;
994
- try {
995
- const { indexFile, metaFile: annMetaFile } = this.getAnnIndexPaths();
996
- this.annIndex.writeIndexSync(indexFile);
997
- await fs.writeFile(annMetaFile, JSON.stringify(this.annMeta, null, 2));
998
- this.annPersistDirty = false;
999
- if (this.config.verbose) {
1000
- console.info(`[ANN] Persisted updated ANN index (${this.annMeta.count} vectors)`);
1001
- }
1002
- } catch (error) {
1003
- console.warn(`[ANN] Failed to persist ANN index: ${error.message}`);
1004
- } finally {
1005
- this._annWriting = false;
1006
- }
1007
- }
1008
- } catch (error) {
1009
- console.warn('[Cache] Failed to save cache:', error.message);
1010
- // Attempt to recover binary store if it was closed during failed save
1011
- if (
1012
- this.config.vectorStoreFormat === 'binary' &&
1013
- this.binaryStore &&
1014
- !this.binaryStore.vectorsBuffer
1015
- ) {
1016
- try {
1017
- console.info('[Cache] Attempting to recover binary store after failed save...');
1018
- this.binaryStore = await BinaryVectorStore.load(this.config.cacheDirectory, {
1019
- contentCacheEntries: this.config.contentCacheEntries,
1020
- });
1021
- console.info('[Cache] Binary store recovered.');
1022
- } catch (recoverErr) {
1023
- console.warn(`[Cache] Failed to recover binary store: ${recoverErr.message}`);
1024
- this.binaryStore = null; // Ensure it's null if unusable
1025
- }
1026
- }
1027
- // Attempt to recover SQLite store if closed during failed save
1028
- if (
1029
- this.config.vectorStoreFormat === 'sqlite' &&
1030
- !this.sqliteStore
1031
- ) {
1032
- try {
1033
- console.info('[Cache] Attempting to recover SQLite store after failed save...');
1034
- this.sqliteStore = await SqliteVectorStore.load(this.config.cacheDirectory);
1035
- if (this.sqliteStore) {
1036
- console.info('[Cache] SQLite store recovered.');
1037
- }
1038
- } catch (recoverErr) {
1039
- console.warn(`[Cache] Failed to recover SQLite store: ${recoverErr.message}`);
1040
- this.sqliteStore = null;
1041
- }
1042
- }
1043
- } finally {
1044
- this.isSaving = false;
1045
- this._saveInProgress = false; // Allow reads to resume
1046
- }
1047
- }
1048
-
1049
- // -------------------- Vector Store API --------------------
1050
-
1051
- getVectorStore() {
1052
- return Array.isArray(this.vectorStore) ? this.vectorStore : [];
1053
- }
1054
-
1055
- async setVectorStore(store) {
1056
- const previousBinaryStore = this.binaryStore;
1057
- const previousSqliteStore = this.sqliteStore;
1058
- this.vectorStore = store;
1059
- this.binaryStore = null;
1060
- this.sqliteStore = null;
1061
- if (Array.isArray(this.vectorStore)) {
1062
- for (const chunk of this.vectorStore) normalizeChunkVector(chunk);
1063
- assignChunkIndices(this.vectorStore);
1064
- }
1065
- this.invalidateAnnIndex();
1066
- if (previousBinaryStore) {
1067
- try {
1068
- await previousBinaryStore.close();
1069
- } catch {
1070
- // ignore close errors
1071
- }
1072
- }
1073
- if (previousSqliteStore) {
1074
- try {
1075
- previousSqliteStore.close();
1076
- } catch {
1077
- // ignore close errors
1078
- }
1079
- }
1080
- }
1081
-
1082
- setLastIndexDuration(durationMs) {
1083
- if (Number.isFinite(durationMs) && durationMs >= 0) {
1084
- this.lastIndexDurationMs = durationMs;
1085
- }
1086
- }
1087
-
1088
- setLastIndexStats(stats) {
1089
- if (stats && typeof stats === 'object') {
1090
- this.lastIndexStats = { ...stats };
1091
- }
1092
- }
1093
-
1094
- getFileHash(file) {
1095
- const entry = this.fileHashes.get(file);
1096
- if (typeof entry === 'string') return entry;
1097
- return entry?.hash;
1098
- }
1099
-
1100
- getFileHashKeys() {
1101
- return Array.from(this.fileHashes.keys());
1102
- }
1103
-
1104
  /** @returns {number} Number of files with a recorded hash entry. */
  getFileHashCount() {
    return this.fileHashes.size;
  }
1107
-
1108
  /** Remove every in-memory file-hash entry. */
  clearFileHashes() {
    this.fileHashes.clear();
  }
1111
-
1112
- setFileHashes(entries) {
1113
- this.fileHashes.clear();
1114
- if (!entries || typeof entries !== 'object') return;
1115
- const iterator =
1116
- entries instanceof Map
1117
- ? entries.entries()
1118
- : Object.entries(entries);
1119
- if (!iterator) return;
1120
- for (const [file, entry] of iterator) {
1121
- const normalized = normalizeFileHashEntry(entry);
1122
- if (normalized) {
1123
- this.fileHashes.set(file, normalized);
1124
- }
1125
- }
1126
- }
1127
-
1128
- setFileHash(file, hash, meta = null) {
1129
- const entry = { hash };
1130
- if (meta && typeof meta === 'object') {
1131
- if (Number.isFinite(meta.mtimeMs)) entry.mtimeMs = meta.mtimeMs;
1132
- if (Number.isFinite(meta.size)) entry.size = meta.size;
1133
- }
1134
- this.fileHashes.set(file, entry);
1135
- }
1136
-
1137
- getFileMeta(file) {
1138
- const entry = this.fileHashes.get(file);
1139
- if (!entry) return null;
1140
- if (typeof entry === 'string') return { hash: entry };
1141
- return entry;
1142
- }
1143
-
1144
  /**
   * Resolve the embedding vector for a chunk.
   * Accepts either a numeric store index or a chunk object, and falls back
   * from the in-memory vector to the binary store, then the SQLite store.
   * @param {number|object} chunk - Store index or chunk record
   * @param {number|null} [index] - Optional explicit store index hint
   * @returns {Float32Array|number[]|null} The vector, or null if unresolvable
   */
  getChunkVector(chunk, index = null) {
    // Numeric form: treat `chunk` as a store index.
    if (typeof chunk === 'number') {
      const store = Array.isArray(this.vectorStore) ? this.vectorStore : null;
      const entry = store ? store[chunk] : null;
      if (entry?.vector) return entry.vector;
      if (this.binaryStore) {
        // Prefer the chunk's recorded backend index; fall back to the store index.
        const resolved = Number.isInteger(entry?._binaryIndex) ? entry._binaryIndex : chunk;
        return this.binaryStore.getVector(resolved);
      }
      if (this.sqliteStore) {
        const resolved = Number.isInteger(entry?._sqliteIndex) ? entry._sqliteIndex : chunk;
        return this.sqliteStore.getVector(resolved);
      }
      return null;
    }

    // Object form: an in-memory vector wins if present.
    if (chunk?.vector) return chunk.vector;
    const resolved = Number.isInteger(index) ? index : chunk?._index;
    if (this.binaryStore && Number.isInteger(chunk?._binaryIndex)) {
      return this.binaryStore.getVector(chunk._binaryIndex);
    }
    // Positional binary lookup is only trusted when there is no in-memory
    // array (otherwise array positions may not match the binary file).
    if (this.binaryStore && !Array.isArray(this.vectorStore) && Number.isInteger(resolved)) {
      return this.binaryStore.getVector(resolved);
    }
    if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk?._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk?.index)
          ? chunk.index
          : resolved;
      if (Number.isInteger(sqliteIndex)) {
        return this.sqliteStore.getVector(sqliteIndex);
      }
    }
    return null;
  }
1180
-
1181
  /**
   * Resolve the text content for a chunk, from memory or a disk-backed store.
   * Accepts a numeric store index or a chunk object; always resolves to a
   * string (empty string when content cannot be found).
   * @param {number|object} chunk - Store index or chunk record
   * @param {number|null} [index] - Optional explicit store index hint
   * @returns {Promise<string>}
   */
  async getChunkContent(chunk, index = null) {
    // Numeric form: treat `chunk` as a store index.
    if (typeof chunk === 'number') {
      const store = Array.isArray(this.vectorStore) ? this.vectorStore : null;
      const entry = store ? store[chunk] : null;
      if (entry) return await this.getChunkContent(entry, chunk);
      // Positional backend lookups are only trusted without an in-memory array.
      if (!store && this.binaryStore) {
        const content = await this.binaryStore.getContent(chunk);
        return content ?? ''; // Ensure consistent empty string return
      }
      if (!store && this.sqliteStore) {
        return this.sqliteStore.getContent(chunk) ?? '';
      }
      return '';
    }
    // In-memory content wins (empty string is valid content, hence the
    // explicit undefined/null checks instead of truthiness).
    if (chunk?.content !== undefined && chunk?.content !== null) {
      return chunk.content;
    }
    if (this.binaryStore && Number.isInteger(chunk?._binaryIndex)) {
      const content = await this.binaryStore.getContent(chunk._binaryIndex);
      return content ?? ''; // Ensure consistent empty string return
    }
    const resolved = Number.isInteger(index) ? index : chunk?._index;
    if (this.binaryStore && !Array.isArray(this.vectorStore) && Number.isInteger(resolved)) {
      const content = await this.binaryStore.getContent(resolved);
      return content ?? ''; // Ensure consistent empty string return
    }
    if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk?._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk?.index)
          ? chunk.index
          : resolved;
      if (Number.isInteger(sqliteIndex)) {
        return this.sqliteStore.getContent(sqliteIndex) ?? '';
      }
    }
    return '';
  }
1219
-
1220
  /** Remove the hash entry for a single file. */
  deleteFileHash(file) {
    this.fileHashes.delete(file);
  }
1223
-
1224
  /**
   * Remove all chunks for a given file from the vector store.
   * Note: This is async to support future backend-specific cleanup.
   * For binary/SQLite stores, actual removal happens on next full save.
   * @param {string} file - Absolute path of file to remove
   */
  async removeFileFromStore(file) {
    if (!Array.isArray(this.vectorStore)) return;
    // In-place compaction to avoid allocating a new large array
    let w = 0;
    for (let r = 0; r < this.vectorStore.length; r++) {
      const chunk = this.vectorStore[r];
      if (chunk.file !== file) {
        chunk._index = w; // reindex surviving chunks to their compacted positions
        this.vectorStore[w++] = chunk;
      }
    }
    this.vectorStore.length = w;

    // Removing shifts labels => rebuild ANN
    this.invalidateAnnIndex();
    this.removeFileCallData(file);
    // Also remove file hash to prevent orphaned entries
    this.fileHashes.delete(file);
  }
1249
-
1250
  /**
   * Append a chunk to the in-memory store, keeping the ANN index in sync
   * incrementally when possible (otherwise the ANN index is invalidated).
   * @param {object} chunk - Chunk record; its vector is normalized in place
   */
  addToStore(chunk) {
    normalizeChunkVector(chunk);

    if (!Array.isArray(this.vectorStore)) {
      this.vectorStore = [];
    }

    // The new chunk's ANN label is its position in the store.
    const label = this.vectorStore.length;
    chunk._index = label;
    this.vectorStore.push(chunk);
    // Extend the ANN vector cache only if it was exactly in sync before the push.
    if (Array.isArray(this.annVectorCache) && this.annVectorCache.length === label) {
      this.annVectorCache.push(chunk.vector);
    }

    // Best-effort incremental ANN append (fast path): requires a clean index
    // whose element count matches the new label and that has spare capacity.
    if (
      this.annIndex &&
      !this.annDirty &&
      this.annMeta &&
      typeof this.annIndex.addPoint === 'function' &&
      this.annMeta.count === label &&
      this.annMeta.maxElements > this.annMeta.count
    ) {
      try {
        this.annIndex.addPoint(chunk.vector, label);
        this.annMeta.count += 1;
        this.annPersistDirty = true;
        return;
      } catch {
        // fall through to full invalidation below
      }
    }

    this.invalidateAnnIndex();
  }
1285
-
1286
  /** Drop the ANN index and all derived state, forcing a rebuild on next use. */
  invalidateAnnIndex() {
    this.annIndex = null;
    this.annMeta = null;
    this.annDirty = true;
    // Nothing valid left to persist once the index is gone.
    this.annPersistDirty = false;
    this.annVectorCache = null;
  }
1293
-
1294
  /**
   * Resolve the vector at store position `index` for ANN build/search,
   * memoizing results in annVectorCache.
   * @param {number} index - Position in the in-memory vector store
   * @returns {Float32Array|null} The vector, or null if unresolvable
   */
  getAnnVector(index) {
    if (!Array.isArray(this.vectorStore)) return null;
    const chunk = this.vectorStore[index];
    if (!chunk) return null;

    // (Re)allocate the cache whenever its size drifts from the store's.
    if (
      !Array.isArray(this.annVectorCache) ||
      this.annVectorCache.length !== this.vectorStore.length
    ) {
      this.annVectorCache = new Array(this.vectorStore.length);
    }

    const cached = this.annVectorCache[index];
    if (cached) return cached;

    // Resolution order: in-memory vector, then binary store, then SQLite.
    let vec = null;
    if (chunk.vector) {
      vec = ensureFloat32(chunk.vector);
    } else if (this.binaryStore && Number.isInteger(chunk._binaryIndex)) {
      vec = this.binaryStore.getVector(chunk._binaryIndex);
    } else if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk.index)
          ? chunk.index
          : index;
      if (Number.isInteger(sqliteIndex)) {
        vec = this.sqliteStore.getVector(sqliteIndex);
      }
    }

    if (!vec) return null;

    // In 'disk' load mode, avoid pinning vectors on chunks to keep memory low.
    if (this.config.vectorStoreLoadMode !== 'disk') {
      chunk.vector = vec;
    }
    this.annVectorCache[index] = vec;
    return vec;
  }
1333
-
1334
- getAnnIndexPaths() {
1335
- return {
1336
- indexFile: path.join(this.config.cacheDirectory, ANN_INDEX_FILE),
1337
- metaFile: path.join(this.config.cacheDirectory, ANN_META_FILE),
1338
- };
1339
- }
1340
-
1341
- // -------------------- ANN --------------------
1342
-
1343
  /**
   * Ensure ANN (Approximate Nearest Neighbor) index is built and ready.
   * Loads from disk cache if available and valid, otherwise builds a new index.
   *
   * @returns {Promise<HierarchicalNSW|null>} The HNSW index, or null if:
   *   - ANN is disabled in config
   *   - vectorStore is not an array
   *   - vectorStore size is below annMinChunks threshold
   *   - hnswlib-node is not available
   *   - Vector dimension mismatch detected
   * @note This method is safe to call multiple times; concurrent calls share the same promise.
   */
  async ensureAnnIndex() {
    if (!this.config.annEnabled) return null;
    if (!Array.isArray(this.vectorStore)) return null;
    if (this.vectorStore.length < (this.config.annMinChunks ?? 5000)) return null;
    // Reuse a clean existing index; otherwise share one in-flight load/build.
    if (this.annIndex && !this.annDirty) return this.annIndex;
    if (this.annLoading) return this.annLoading;

    this.annLoading = (async () => {
      try {
        const HierarchicalNSW = await loadHnswlib();
        if (!HierarchicalNSW) {
          if (hnswlibLoadError) {
            this.addInitError('loadHnswlib', hnswlibLoadError);
          }
          return null;
        }

        // Dimension comes from the first in-memory vector, else a backend store.
        const dim =
          this.vectorStore[0]?.vector?.length ||
          this.binaryStore?.dim ||
          this.sqliteStore?.dim;
        if (!dim) return null;

        // Validate dimension consistency before building index
        // Use stratified sampling for better coverage across entire store
        let dimensionMismatch = false;
        const sampleSize = Math.min(ANN_DIMENSION_SAMPLE_SIZE, this.vectorStore.length);
        const step = Math.max(1, Math.floor(this.vectorStore.length / sampleSize));
        for (let i = step; i < this.vectorStore.length; i += step) {
          const v = this.vectorStore[i]?.vector;
          if (v && v.length !== dim) {
            dimensionMismatch = true;
            console.warn(
              `[ANN] Dimension mismatch at index ${i}: expected ${dim}, got ${v.length}. ` +
                'This may indicate a config change mid-index. Consider full reindex.'
            );
            break;
          }
        }

        if (dimensionMismatch) {
          this.addInitError('ensureAnnIndex', `Vector dimension inconsistency detected. Expected ${dim}. Full reindex required.`);
          return null; // Skip ANN build - fallback to linear search
        }

        // Try the persisted index first, unless the store changed since last build.
        if (!this.annDirty && this.config.annIndexCache !== false) {
          const loaded = await this.loadAnnIndexFromDisk(HierarchicalNSW, dim);
          if (loaded) return this.annIndex;
        }

        return await this.buildAnnIndex(HierarchicalNSW, dim);
      } finally {
        // Always clear the shared promise so later calls can retry.
        this.annLoading = null;
      }
    })();

    return this.annLoading;
  }
1413
-
1414
  /**
   * Attempt to adopt a persisted ANN index from disk.
   * Validates version, embedding model, dimension, element count, and build
   * parameters against the current store/config before accepting it; any
   * mismatch returns false so the caller rebuilds from scratch.
   * @param {Function} HierarchicalNSW - hnswlib index constructor
   * @param {number} dim - Expected vector dimension
   * @returns {Promise<boolean>} true when the index was loaded and adopted
   */
  async loadAnnIndexFromDisk(HierarchicalNSW, dim) {
    const { indexFile, metaFile } = this.getAnnIndexPaths();
    // Missing metadata file => nothing persisted; not an error.
    const metaData = await fs.readFile(metaFile, 'utf-8').catch(() => null);
    if (!metaData) return false;

    let meta;
    try {
      meta = JSON.parse(metaData);
    } catch {
      console.warn('[ANN] Invalid ANN metadata, rebuilding');
      return false;
    }

    if (meta?.version !== ANN_META_VERSION) {
      console.warn(`[ANN] ANN index version mismatch (${meta?.version}), rebuilding`);
      return false;
    }

    if (meta?.embeddingModel !== this.config.embeddingModel) {
      console.warn('[ANN] Embedding model changed for ANN index, rebuilding');
      return false;
    }

    if (meta?.dim !== dim || meta?.count !== this.vectorStore.length) {
      console.warn('[ANN] ANN index size mismatch, rebuilding');
      return false;
    }

    if (
      meta?.metric !== this.config.annMetric ||
      meta?.m !== this.config.annM ||
      meta?.efConstruction !== this.config.annEfConstruction
    ) {
      console.warn('[ANN] ANN index config changed, rebuilding');
      return false;
    }

    // maxElements is optional in older metadata; default capacity to count.
    let maxElements = meta?.maxElements;
    if (!Number.isInteger(maxElements)) {
      maxElements = meta.count;
    } else if (maxElements < meta.count) {
      console.warn('[ANN] ANN capacity invalid, rebuilding');
      return false;
    }

    const index = new HierarchicalNSW(meta.metric, dim);
    const loaded = readHnswIndex(index, indexFile, maxElements);
    if (!loaded) {
      console.warn('[ANN] Failed to load ANN index file, rebuilding');
      return false;
    }

    if (typeof index.setEf === 'function') {
      index.setEf(this.config.annEfSearch);
    }

    // Adopt the loaded index: clean, nothing pending persistence.
    this.annIndex = index;
    this.annMeta = { ...meta, maxElements };
    this.annDirty = false;
    this.annPersistDirty = false;

    if (this.config.verbose) {
      console.info(`[ANN] Loaded ANN index (${meta.count} vectors, cap=${maxElements})`);
    }
    return true;
  }
1480
-
1481
  /**
   * Build a fresh HNSW index over the entire vector store, then persist it
   * (unless annIndexCache is disabled). Yields to the event loop periodically
   * to stay responsive during large builds.
   * @param {Function} HierarchicalNSW - hnswlib index constructor
   * @param {number} dim - Vector dimension
   * @returns {Promise<object|null>} The built index, or null on failure
   */
  async buildAnnIndex(HierarchicalNSW, dim) {
    if (!Array.isArray(this.vectorStore)) return null;
    const total = this.vectorStore.length;
    if (total === 0) return null;

    try {
      const index = new HierarchicalNSW(this.config.annMetric, dim);

      // Capacity is over-provisioned so incremental appends can avoid rebuilds.
      const maxElements = computeAnnCapacity(total, this.config);
      initHnswIndex(index, maxElements, this.config.annM, this.config.annEfConstruction);

      const yieldEvery = Number.isInteger(this.config.annBuildYieldEvery)
        ? this.config.annBuildYieldEvery
        : 1000;

      for (let i = 0; i < total; i++) {
        const vector = this.getAnnVector(i);
        if (!vector) throw new Error(`Missing vector for ANN index at position ${i}`);
        index.addPoint(vector, i);

        if (yieldEvery > 0 && i > 0 && i % yieldEvery === 0) {
          await yieldToLoop();
        }
      }

      if (typeof index.setEf === 'function') {
        index.setEf(this.config.annEfSearch);
      }

      this.annIndex = index;
      this.annMeta = {
        version: ANN_META_VERSION,
        embeddingModel: this.config.embeddingModel,
        metric: this.config.annMetric,
        dim,
        count: total,
        maxElements,
        m: this.config.annM,
        efConstruction: this.config.annEfConstruction,
        efSearch: this.config.annEfSearch,
      };
      this.annDirty = false;
      // Mark pending persistence; cleared below if the write succeeds now.
      this.annPersistDirty = true;

      if (this.config.annIndexCache !== false) {
        try {
          await fs.mkdir(this.config.cacheDirectory, { recursive: true });
          const { indexFile, metaFile } = this.getAnnIndexPaths();
          index.writeIndexSync(indexFile);
          await fs.writeFile(metaFile, JSON.stringify(this.annMeta, null, 2));
          this.annPersistDirty = false;
          if (this.config.verbose) {
            console.info(`[ANN] Saved ANN index (${total} vectors, cap=${maxElements})`);
          }
        } catch (error) {
          // Persist failure is non-fatal: the in-memory index is still usable.
          console.warn(`[ANN] Failed to save ANN index: ${error.message}`);
        }
      }

      return index;
    } catch (error) {
      console.warn(`[ANN] Failed to build ANN index: ${error.message}`);
      this.addInitError('buildAnnIndex', error);
      // Leave ANN state marked dirty so a later call can retry the build.
      this.annIndex = null;
      this.annMeta = null;
      this.annDirty = true;
      this.annPersistDirty = false;
      return null;
    }
  }
1551
-
1552
  /**
   * Query the ANN index for k nearest neighbors.
   * Falls back gracefully to empty results if ANN is unavailable.
   *
   * @param {Float32Array|number[]} queryVector - Normalized query embedding
   * @param {number} k - Number of neighbors to return
   * @returns {Promise<number[]>} Array of chunk indices sorted by similarity (may be empty)
   * @throws Never throws - returns empty array on all error conditions
   * @note Automatically invalidates corrupted index and falls back to linear search on next query
   */
  async queryAnn(queryVector, k) {
    if (!Array.isArray(this.vectorStore) || this.vectorStore.length === 0) return [];
    const index = await this.ensureAnnIndex();
    if (!index) return [];

    const qVec = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);

    // Wrap searchKnn in try-catch to handle corrupted index or dimension mismatches
    let results;
    try {
      results = index.searchKnn(qVec, k);
    } catch (err) {
      console.warn(`[ANN] searchKnn failed: ${err.message}. Falling back to linear search.`);
      this.addInitError('queryAnn', err);
      // Invalidate to trigger rebuild on next query
      this.invalidateAnnIndex();
      return [];
    }

    const labels = normalizeLabels(results);

    if (labels.length === 0) return [];

    // Discard labels pointing outside the current store (stale index entries).
    const filtered = labels.filter(
      (label) => Number.isInteger(label) && label >= 0 && label < this.vectorStore.length
    );

    return filtered;
  }
1591
-
1592
- async clear() {
1593
- if (!this.config.enableCache) return;
1594
-
1595
- try {
1596
- await fs.rm(this.config.cacheDirectory, { recursive: true, force: true });
1597
- this.vectorStore = [];
1598
- if (this.binaryStore) {
1599
- try {
1600
- await this.binaryStore.close();
1601
- } catch {
1602
- // ignore close errors
1603
- }
1604
- }
1605
- this.binaryStore = null;
1606
- if (this.sqliteStore) {
1607
- try {
1608
- this.sqliteStore.close();
1609
- } catch {
1610
- // ignore close errors
1611
- }
1612
- }
1613
- this.sqliteStore = null;
1614
- this.fileHashes = new Map();
1615
- this.invalidateAnnIndex();
1616
- await this.clearCallGraphData();
1617
- if (this.config.verbose) {
1618
- console.info(`[Cache] Cache cleared successfully: ${this.config.cacheDirectory}`);
1619
- }
1620
- } catch (error) {
1621
- console.error('[Cache] Failed to clear cache:', error.message);
1622
- throw error;
1623
- }
1624
- }
1625
-
1626
- /**
1627
- * Adjust efSearch at runtime for speed/accuracy tradeoff.
1628
- * Higher values = more accurate but slower.
1629
- * @param {number} efSearch - New efSearch value (typically 16-512)
1630
- * @returns {object} Result with success status and current config
1631
- */
1632
- setEfSearch(efSearch) {
1633
- if (typeof efSearch !== 'number' || efSearch < 1 || efSearch > 1000) {
1634
- return {
1635
- success: false,
1636
- error: 'efSearch must be a number between 1 and 1000',
1637
- };
1638
- }
1639
-
1640
- this.config.annEfSearch = efSearch;
1641
-
1642
- if (this.annIndex && typeof this.annIndex.setEf === 'function') {
1643
- this.annIndex.setEf(efSearch);
1644
- if (this.annMeta) this.annMeta.efSearch = efSearch;
1645
- this.annPersistDirty = true;
1646
- if (this.config.verbose) {
1647
- console.info(`[ANN] efSearch updated to ${efSearch} (applied to active index)`);
1648
- }
1649
- return { success: true, applied: true, efSearch };
1650
- }
1651
-
1652
- if (this.config.verbose) {
1653
- console.info(`[ANN] efSearch updated to ${efSearch} (will apply on next index build)`);
1654
- }
1655
- return { success: true, applied: false, efSearch };
1656
- }
1657
-
1658
- /**
1659
- * Get current ANN index statistics for diagnostics.
1660
- * @returns {object} ANN stats including index state, config, and vector count
1661
- */
1662
- getAnnStats() {
1663
- return {
1664
- enabled: this.config.annEnabled ?? false,
1665
- indexLoaded: this.annIndex !== null,
1666
- dirty: this.annDirty,
1667
- vectorCount: Array.isArray(this.vectorStore) ? this.vectorStore.length : 0,
1668
- minChunksForAnn: this.config.annMinChunks ?? 5000,
1669
- config: this.annMeta
1670
- ? {
1671
- metric: this.annMeta.metric,
1672
- dim: this.annMeta.dim,
1673
- count: this.annMeta.count,
1674
- m: this.annMeta.m,
1675
- efConstruction: this.annMeta.efConstruction,
1676
- efSearch: this.config.annEfSearch,
1677
- }
1678
- : null,
1679
- };
1680
- }
1681
-
1682
- // -------------------- Call Graph --------------------
1683
-
1684
- async clearCallGraphData({ removeFile = false } = {}) {
1685
- this.fileCallData.clear();
1686
- this.callGraph = null;
1687
-
1688
- if (removeFile && this.config.enableCache) {
1689
- const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
1690
- try {
1691
- await fs.rm(callGraphFile, { force: true });
1692
- } catch (error) {
1693
- if (this.config.verbose) {
1694
- console.warn(`[Cache] Failed to remove call-graph cache: ${error.message}`);
1695
- }
1696
- }
1697
- }
1698
- }
1699
-
1700
- pruneCallGraphData(validFiles) {
1701
- if (!validFiles || this.fileCallData.size === 0) return 0;
1702
-
1703
- let pruned = 0;
1704
- for (const file of Array.from(this.fileCallData.keys())) {
1705
- if (!validFiles.has(file)) {
1706
- this.fileCallData.delete(file);
1707
- pruned++;
1708
- }
1709
- }
1710
-
1711
- if (pruned > 0) this.callGraph = null;
1712
- return pruned;
1713
- }
1714
-
1715
  /** @returns {object|undefined} Call data recorded for the given file. */
  getFileCallData(file) {
    return this.fileCallData.get(file);
  }
1718
-
1719
  /** @returns {boolean} Whether call data has been recorded for the file. */
  hasFileCallData(file) {
    return this.fileCallData.has(file);
  }
1722
-
1723
- getFileCallDataKeys() {
1724
- return Array.from(this.fileCallData.keys());
1725
- }
1726
-
1727
  /** @returns {number} Number of files with recorded call data. */
  getFileCallDataCount() {
    return this.fileCallData.size;
  }
1730
-
1731
  /**
   * Sets call data for a specific file and invalidates the derived call graph
   * so it is rebuilt lazily on next use.
   * @param {string} file
   * @param {object} data
   */
  setFileCallData(file, data) {
    this.fileCallData.set(file, data);
    this.callGraph = null;
  }
1740
-
1741
- /**
1742
- * Sets the entire file call data map.
1743
- * @param {Map<string, object>|object} entries
1744
- */
1745
- setFileCallDataEntries(entries) {
1746
- if (entries instanceof Map) {
1747
- this.fileCallData = entries;
1748
- } else {
1749
- this.fileCallData.clear();
1750
- if (entries && typeof entries === 'object') {
1751
- for (const [file, data] of Object.entries(entries)) {
1752
- this.fileCallData.set(file, data);
1753
- }
1754
- }
1755
- }
1756
- this.callGraph = null;
1757
- }
1758
-
1759
  /** Drop all per-file call data and the derived call graph. */
  clearFileCallData() {
    this.fileCallData.clear();
    this.callGraph = null;
  }
1763
-
1764
  /** Remove call data for one file and invalidate the derived call graph. */
  removeFileCallData(file) {
    this.fileCallData.delete(file);
    this.callGraph = null;
  }
1768
-
1769
  /**
   * Rebuild the call graph from fileCallData.
   * Concurrent callers share a single in-flight build promise; on failure the
   * graph is left null so dependent features degrade gracefully.
   * @returns {Promise<void>}
   */
  async rebuildCallGraph() {
    if (this._callGraphBuild) return this._callGraphBuild;

    this._callGraphBuild = (async () => {
      try {
        // Lazy import keeps startup cost low when call-graph is unused.
        const { buildCallGraph } = await import('./call-graph.js');
        this.callGraph = buildCallGraph(this.fileCallData);
        if (this.config.verbose && this.callGraph) {
          console.info(
            `[CallGraph] Built graph: ${this.callGraph.defines.size} definitions, ${this.callGraph.calledBy.size} call targets`
          );
        }
      } catch (err) {
        console.error(`[CallGraph] Failed to build: ${err.message}`);
        this.callGraph = null;
      } finally {
        // Clear the shared promise so a later call can retry.
        this._callGraphBuild = null;
      }
    })();

    return this._callGraphBuild;
  }
1791
-
1792
  /**
   * Find files related to the given symbols via the call graph.
   * @param {string[]} symbols
   * @returns {Promise<Map>} Related-file map (empty when disabled/unavailable)
   */
  async getRelatedFiles(symbols) {
    if (!this.config.callGraphEnabled || symbols.length === 0) return new Map();
    // Build lazily on first use when per-file data exists but no graph yet.
    if (!this.callGraph && this.fileCallData.size > 0) await this.rebuildCallGraph();
    if (!this.callGraph) return new Map();

    const { getRelatedFiles } = await import('./call-graph.js');
    return getRelatedFiles(this.callGraph, symbols, this.config.callGraphMaxHops);
  }
1800
-
1801
- getCallGraphStats() {
1802
- return {
1803
- enabled: this.config.callGraphEnabled ?? false,
1804
- filesWithData: this.fileCallData.size,
1805
- graphBuilt: this.callGraph !== null,
1806
- definitions: this.callGraph?.defines.size ?? 0,
1807
- callTargets: this.callGraph?.calledBy.size ?? 0,
1808
- };
1809
- }
1810
-
1811
- // -------------------- Abstraction Layer --------------------
1812
-
1813
- /**
1814
- * Returns the total number of chunks in the store.
1815
- * @returns {number}
1816
- */
1817
- getStoreSize() {
1818
- if (Array.isArray(this.vectorStore)) return this.vectorStore.length;
1819
- if (this.binaryStore) return this.binaryStore.length;
1820
- if (this.sqliteStore) return this.sqliteStore.length();
1821
- return 0;
1822
- }
1823
-
1824
  /**
   * Retrieves a vector by its store index.
   * Thin delegation to getChunkVector, which handles all backend fallbacks.
   * @param {number} index
   * @returns {Float32Array|null}
   */
  getVector(index) {
    return this.getChunkVector(index);
  }
1832
-
1833
  /**
   * Retrieves a chunk object by its store index.
   * For disk-backed stores a lightweight record is synthesized, with the
   * vector resolved eagerly and a backend index marker attached.
   * @param {number} index
   * @returns {object|null}
   */
  getChunk(index) {
    if (Array.isArray(this.vectorStore) && index >= 0 && index < this.vectorStore.length) {
      return this.vectorStore[index];
    }
    if (this.binaryStore) {
      const record = this.binaryStore.getRecord(index);
      if (record) {
        return {
          file: record.file,
          startLine: record.startLine,
          endLine: record.endLine,
          vector: this.binaryStore.getVector(index),
          _index: index,
          _binaryIndex: index,
        };
      }
    }
    if (this.sqliteStore) {
      const record = this.sqliteStore.getRecord(index);
      if (record) {
        return {
          file: record.file,
          startLine: record.startLine,
          endLine: record.endLine,
          vector: this.sqliteStore.getVector(index),
          _index: index,
          _sqliteIndex: index,
        };
      }
    }
    return null;
  }
1870
- }
1
+ import fs from 'fs/promises';
2
+ import path from 'path';
3
+ import { Worker } from 'worker_threads';
4
+ import { StreamingJsonWriter } from './json-writer.js';
5
+ import { BinaryVectorStore } from './vector-store-binary.js';
6
+ import { SqliteVectorStore } from './vector-store-sqlite.js';
7
+ import {
8
+ JSON_WORKER_THRESHOLD_BYTES,
9
+ ANN_DIMENSION_SAMPLE_SIZE,
10
+ HNSWLIB_ERROR_RESET_MS,
11
+ DEFAULT_READER_WAIT_TIMEOUT_MS,
12
+ } from './constants.js';
13
+
14
// Bump when the persisted cache layout changes; mismatched caches are ignored on load.
const CACHE_META_VERSION = 1;
const CACHE_META_FILE = 'meta.json';

// ANN meta version stays at 1 for compatibility; maxElements is optional.
const ANN_META_VERSION = 1;
const ANN_INDEX_FILE = 'ann-index.bin';
const ANN_META_FILE = 'ann-meta.json';

// Persisted call-graph sidecar file inside the cache directory.
const CALL_GRAPH_FILE = 'call-graph.json';

// Test detection enables eager cache reads and extra vector validation paths.
const IS_TEST_ENV = process.env.VITEST === 'true' || process.env.NODE_ENV === 'test';

// Yield to event loop to keep IDE/extension host responsive during heavy CPU loops
const yieldToLoop = () => new Promise((resolve) => setImmediate(resolve));

// Lazy, memoized hnswlib-node loader state (see loadHnswlib below).
let hnswlibPromise = null;
let hnswlibLoadError = null;
31
+
32
/**
 * Parses a JSON file in a worker thread so large payloads do not block the
 * main event loop.
 * @param {string} filePath - Path of the JSON file to parse.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {Error} When the worker reports failure, errors, or exits without answering.
 */
async function parseJsonInWorker(filePath) {
  return new Promise((resolve, reject) => {
    let settled = false;
    const worker = new Worker(new URL('./json-worker.js', import.meta.url), {
      workerData: { filePath },
    });

    // finish() provides safe cleanup: removeAllListeners ensures no memory leak from
    // orphaned handlers, terminate() cleans up the worker process. The 'settled' flag
    // prevents double-resolution if multiple events fire before cleanup completes.
    const finish = (handler, value) => {
      if (settled) return;
      settled = true;
      worker.removeAllListeners();
      // terminate() may be missing on mocks; swallow its rejection if it returns a promise.
      const termination = worker.terminate?.();
      if (termination && typeof termination.catch === 'function') termination.catch(() => null);
      handler(value);
    };

    worker.once('message', (msg) => {
      if (msg?.ok) {
        finish(resolve, msg.data);
      } else {
        const err = new Error(msg?.error || 'JSON worker failed');
        console.warn(`[Cache] ${err.message}`);
        finish(reject, err);
      }
    });

    worker.once('error', (err) => {
      console.error(`[Cache] JSON worker error: ${err.message}`);
      finish(reject, err);
    });

    worker.once('exit', (code) => {
      if (code !== 0) {
        const err = new Error(`JSON worker exited with code ${code}`);
        console.error(`[Cache] ${err.message}`);
        finish(reject, err);
        return;
      }
      // A clean exit without a prior 'message' means the worker never answered.
      if (!settled) {
        const err = new Error('JSON worker exited without sending a response');
        console.error(`[Cache] ${err.message}`);
        finish(reject, err);
      }
    });
  });
}
81
+
82
/**
 * Reads and parses a JSON file, delegating to a worker thread when the file
 * is large enough to stall the event loop.
 * @param {string} filePath - Path of the JSON file.
 * @param {{workerThresholdBytes?: number}} [options] - Size at which worker parsing kicks in.
 * @returns {Promise<any|null>} Parsed value, or null when the file is missing or unparseable.
 */
async function readJsonFile(
  filePath,
  { workerThresholdBytes = JSON_WORKER_THRESHOLD_BYTES } = {}
) {
  // A missing file is a normal cache-miss, never an error.
  let stats = null;
  try {
    stats = await fs.stat(filePath);
  } catch {
    return null;
  }

  try {
    const sizeKnown = stats !== null && typeof stats.size === 'number';
    const useWorker =
      typeof Worker === 'function' && sizeKnown && stats.size >= workerThresholdBytes;
    if (useWorker) {
      return await parseJsonInWorker(filePath);
    }
    const raw = await fs.readFile(filePath, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    console.warn(`[Cache] Failed to parse ${path.basename(filePath)}: ${error.message}`);
    return null;
  }
}
110
+
111
/**
 * Lazily imports hnswlib-node and memoizes the resulting constructor.
 * Returns the HierarchicalNSW class, or null when the native module is
 * unavailable (callers then fall back to linear search).
 * @returns {Promise<Function|null>}
 */
async function loadHnswlib() {
  // Reset error state after configured timeout to allow retry
  if (hnswlibLoadError) {
    if (hnswlibLoadError._timestamp && Date.now() - hnswlibLoadError._timestamp > HNSWLIB_ERROR_RESET_MS) {
      hnswlibLoadError = null;
      hnswlibPromise = null;
    } else {
      // Still inside the back-off window: fail fast without re-importing.
      return null;
    }
  }

  if (!hnswlibPromise) {
    hnswlibPromise = import('hnswlib-node')
      .then((mod) => {
        // Handle both ESM and CJS-interop export shapes of the native addon.
        const HierarchicalNSW = mod?.HierarchicalNSW || mod?.default?.HierarchicalNSW;
        if (!HierarchicalNSW) throw new Error('HierarchicalNSW export not found');
        return HierarchicalNSW;
      })
      .catch((err) => {
        // Store timestamp to allow later retry
        err._timestamp = Date.now();
        hnswlibLoadError = err;
        console.warn(`[ANN] hnswlib-node unavailable, using linear search (${err.message})`);
        return null;
      });
  }

  return hnswlibPromise;
}
140
+
141
/**
 * Initializes an HNSW index, tolerating signature drift across hnswlib-node
 * releases: tries the 4-argument form (with random seed 100), then the
 * 3-argument legacy form, and finally the bare capacity-only form.
 * @param {object} index - HierarchicalNSW instance to initialize.
 * @param {number} maxElements - Index capacity.
 * @param {number} m - HNSW M parameter.
 * @param {number} efConstruction - HNSW efConstruction parameter.
 */
function initHnswIndex(index, maxElements, m, efConstruction) {
  const attempts = [
    { label: 'Standard', args: [maxElements, m, efConstruction, 100] },
    { label: 'Legacy', args: [maxElements, m, efConstruction] },
  ];
  for (const { label, args } of attempts) {
    try {
      index.initIndex(...args);
      return;
    } catch (err) {
      console.warn(`[ANN] ${label} init failed: ${err.message}`);
    }
  }
  // Last resort; any failure here propagates to the caller.
  index.initIndex(maxElements);
}
156
+
157
/**
 * Loads a persisted HNSW index from disk, tolerating hnswlib-node signature
 * drift: newer builds accept a capacity hint, older ones only the path.
 * @param {object} index - HierarchicalNSW instance to populate.
 * @param {string} filePath - Path of the serialized index.
 * @param {number} maxElements - Capacity hint for newer builds.
 * @returns {boolean} true when the index was read, false otherwise.
 */
function readHnswIndex(index, filePath, maxElements) {
  try {
    index.readIndexSync(filePath, maxElements);
    return true;
  } catch {
    // Fall through to the legacy single-argument form.
  }
  try {
    index.readIndexSync(filePath);
    return true;
  } catch (err) {
    console.warn(`[ANN] Read index failed: ${err.message}`);
    return false;
  }
}
172
+
173
/**
 * Normalizes an ANN search result to a plain array of labels. hnswlib
 * versions return either a bare array or an object keyed by `labels`,
 * `neighbors`, or `indices`.
 * @param {Array|object|null|undefined} result - Raw search result.
 * @returns {Array} Labels as a plain array (possibly empty).
 */
function normalizeLabels(result) {
  if (Array.isArray(result)) return result;
  if (!result) return [];
  const found = result.labels || result.neighbors || result.indices;
  if (!found) return [];
  return Array.from(found);
}
179
+
180
/**
 * Coerces any numeric array-like into a Float32Array by VALUE conversion;
 * reinterpreting raw bytes would silently corrupt embeddings.
 * @param {Float32Array|ArrayBufferView|number[]|null|undefined} vector
 * @returns {Float32Array|null} The same instance when already Float32Array,
 *   a converted copy otherwise, or null for falsy input.
 * @throws {Error} In test environments, when the vector contains NaN/Infinity.
 */
function ensureFloat32(vector) {
  if (!vector) return null;
  if (vector instanceof Float32Array) return vector;

  const converted = ArrayBuffer.isView(vector)
    ? Float32Array.from(vector)
    : new Float32Array(vector);

  // Under test, reject non-finite values early so corrupted vectors cannot
  // silently poison similarity scores.
  if (IS_TEST_ENV && converted.length > 0) {
    for (let i = 0; i < converted.length; i++) {
      if (!Number.isFinite(converted[i])) {
        throw new Error(
          `Invalid vector value at index ${i}: ${converted[i]}. ` +
            'Vector contains NaN or Infinity, which will corrupt search results.'
        );
      }
    }
  }

  return converted;
}
206
+
207
/**
 * Coerces a chunk's vector (if present) to Float32Array, mutating in place.
 * @param {object|null|undefined} chunk - Chunk whose `vector` to normalize.
 */
function normalizeChunkVector(chunk) {
  const vec = chunk?.vector;
  if (vec) {
    chunk.vector = ensureFloat32(vec);
  }
}
210
+
211
/**
 * Stamps each chunk with its array position (`_index`) so later lookups can
 * map a chunk back to its store slot. Non-array stores are left untouched,
 * as are null/undefined slots inside the array.
 * @param {Array|any} store - The vector store to annotate.
 */
function assignChunkIndices(store) {
  if (!Array.isArray(store)) return;
  store.forEach((chunk, position) => {
    if (chunk) chunk._index = position;
  });
}
220
+
221
/**
 * Normalizes a persisted file-hash entry. Accepts the legacy bare-string
 * format and the newer object format; anything without a string hash is
 * rejected. Non-finite mtimeMs/size values are dropped.
 * @param {string|object|null|undefined} entry - Raw entry from disk.
 * @returns {{hash: string, mtimeMs?: number, size?: number}|null}
 */
function normalizeFileHashEntry(entry) {
  if (!entry) return null;
  if (typeof entry === 'string') return { hash: entry };
  if (typeof entry !== 'object' || typeof entry.hash !== 'string') return null;

  const { hash, mtimeMs, size } = entry;
  const normalized = { hash };
  if (Number.isFinite(mtimeMs)) normalized.mtimeMs = mtimeMs;
  if (Number.isFinite(size)) normalized.size = size;
  return normalized;
}
231
+
232
/**
 * Prepares a file-hash entry for writing to disk.
 * NOTE: intentionally mirrors normalizeFileHashEntry's validation rules so
 * entries round-trip losslessly between memory and disk; keep the two in sync.
 * @param {string|object|null|undefined} entry - In-memory entry.
 * @returns {{hash: string, mtimeMs?: number, size?: number}|null}
 */
function serializeFileHashEntry(entry) {
  if (!entry) return null;
  if (typeof entry === 'string') return { hash: entry };
  if (typeof entry !== 'object') return null;

  const { hash, mtimeMs, size } = entry;
  if (typeof hash !== 'string') return null;
  const serialized = { hash };
  if (Number.isFinite(mtimeMs)) serialized.mtimeMs = mtimeMs;
  if (Number.isFinite(size)) serialized.size = size;
  return serialized;
}
242
+
243
/**
 * Computes the ANN index capacity, over-provisioning so incremental adds
 * rarely force a rebuild: grows by a multiplicative factor AND a flat
 * headroom, then takes the largest of the three candidates.
 * @param {number} total - Current number of vectors.
 * @param {object} config - May carry annCapacityFactor / annCapacityExtra.
 * @returns {number} Capacity, never less than `total`.
 */
function computeAnnCapacity(total, config) {
  const factor = typeof config.annCapacityFactor === 'number' ? config.annCapacityFactor : 1.2;
  const headroom = Number.isInteger(config.annCapacityExtra) ? config.annCapacityExtra : 1024;
  return Math.max(total, Math.ceil(total * factor), total + headroom);
}
250
+
251
+ export class EmbeddingsCache {
252
  /**
   * @param {object} config - Server configuration; read throughout for the
   *   cache directory, embedding model/dimension, store format, and tuning
   *   knobs. Held by reference, not copied.
   */
  constructor(config) {
    this.config = config;

    // Primary in-memory chunk store (array mode) and per-file content hashes.
    this.vectorStore = [];
    this.fileHashes = new Map();
    this.isSaving = false;
    this.lastIndexDurationMs = null;
    this.lastIndexStats = null;

    // Metadata persisted next to the cache; load() compares it against the
    // current config to detect incompatible caches (version/model/dimension).
    this.cacheMeta = {
      version: CACHE_META_VERSION,
      embeddingModel: config.embeddingModel,
      embeddingDimension: config.embeddingDimension ?? null,
    };

    // Save coalescing / debounce (serialized via saveQueue)
    this.saveQueue = Promise.resolve();
    this._saveTimer = null;
    this._saveRequested = false;
    this._savePromise = null;

    // ANN state
    this.annIndex = null;
    this.annMeta = null;
    this.annDirty = false; // needs rebuild
    this.annPersistDirty = false; // in-memory differs from disk
    this.annLoading = null;
    this.annVectorCache = null;

    // Call graph
    this.fileCallData = new Map();
    this.callGraph = null;
    this._callGraphBuild = null;

    // Binary vector store (optional)
    this.binaryStore = null;

    // SQLite vector store (optional)
    this.sqliteStore = null;

    // Error tracking
    this.initErrors = [];

    // Concurrency hooks (read tracking)
    this.activeReads = 0;
    this._readWaiters = [];
    this._saveInProgress = false; // Prevents new reads during save

    // Lazy reload support after dropping in-memory vectors
    this._clearedAfterIndex = false;
    this._loadPromise = null;
  }
304
+
305
+ /**
306
+ * Add an initialization error with consistent structure.
307
+ * @param {string} stage - The stage where the error occurred (e.g., 'loadHnswlib', 'ensureAnnIndex')
308
+ * @param {Error|string} error - The error object or message
309
+ */
310
+ addInitError(stage, error) {
311
+ this.initErrors.push({
312
+ stage,
313
+ message: error instanceof Error ? error.message : String(error),
314
+ stack: error instanceof Error ? error.stack : null,
315
+ timestamp: Date.now(),
316
+ });
317
+ }
318
+
319
+ clearInMemoryState() {
320
+ this.vectorStore = [];
321
+ this.fileHashes.clear();
322
+ this.invalidateAnnIndex();
323
+ this.fileCallData.clear();
324
+ this.callGraph = null;
325
+ this.initErrors = [];
326
+ if (this.binaryStore) {
327
+ try {
328
+ this.binaryStore.close?.();
329
+ } catch {
330
+ // ignore close errors
331
+ }
332
+ this.binaryStore = null;
333
+ }
334
+ if (this.sqliteStore) {
335
+ try {
336
+ this.sqliteStore.close?.();
337
+ } catch {
338
+ // ignore close errors
339
+ }
340
+ this.sqliteStore = null;
341
+ }
342
+ }
343
+
344
+ async close() {
345
+ if (this.binaryStore) {
346
+ await this.binaryStore.close();
347
+ this.binaryStore = null;
348
+ }
349
+ if (this.sqliteStore) {
350
+ try {
351
+ this.sqliteStore.close();
352
+ } catch {
353
+ // SQLite may already be closed or in error state
354
+ }
355
+ this.sqliteStore = null;
356
+ }
357
+ }
358
+
359
+ async ensureLoaded({ preferDisk = false } = {}) {
360
+ if (!this.config.enableCache) return;
361
+ if (!this._clearedAfterIndex) return;
362
+ if (this._loadPromise) return this._loadPromise;
363
+
364
+ this._loadPromise = (async () => {
365
+ if (preferDisk && this.config.verbose) {
366
+ console.info('[Cache] ensureLoaded: forcing disk vector mode for incremental low-RAM reload');
367
+ }
368
+ await this.load({
369
+ forceVectorLoadMode: preferDisk ? 'disk' : undefined,
370
+ });
371
+ this._clearedAfterIndex = false;
372
+ })().finally(() => {
373
+ this._loadPromise = null;
374
+ });
375
+
376
+ return this._loadPromise;
377
+ }
378
+
379
+ async dropInMemoryVectors() {
380
+ if (!this.config.enableCache) return;
381
+
382
+ if (this.activeReads > 0) {
383
+ await this.waitForReaders();
384
+ }
385
+
386
+ this.vectorStore = [];
387
+ this.annVectorCache = null;
388
+ this.annIndex = null;
389
+ this.annMeta = null;
390
+ this.annDirty = true;
391
+ this.annPersistDirty = false;
392
+
393
+ if (this.binaryStore) {
394
+ try {
395
+ await this.binaryStore.close();
396
+ } catch {
397
+ // ignore close errors
398
+ }
399
+ this.binaryStore = null;
400
+ }
401
+
402
+ if (this.sqliteStore) {
403
+ try {
404
+ this.sqliteStore.close();
405
+ } catch {
406
+ // ignore close errors
407
+ }
408
+ this.sqliteStore = null;
409
+ }
410
+
411
+ this._clearedAfterIndex = true;
412
+ }
413
+
414
+ // -------------------- Concurrency Hooks --------------------
415
+
416
+ startRead() {
417
+ // Prevent new reads while save is in progress to avoid race conditions
418
+ if (this._saveInProgress) {
419
+ throw new Error('Cache save in progress, try again shortly');
420
+ }
421
+ this.activeReads++;
422
+ }
423
+
424
+ endRead() {
425
+ if (this.activeReads > 0) {
426
+ this.activeReads--;
427
+ if (this.activeReads === 0 && this._readWaiters.length > 0) {
428
+ const waiters = this._readWaiters;
429
+ this._readWaiters = [];
430
+ for (const resolve of waiters) {
431
+ resolve();
432
+ }
433
+ }
434
+ }
435
+ }
436
+
437
+ async waitForReaders() {
438
+ if (this.activeReads === 0) return;
439
+ await new Promise((resolve) => {
440
+ this._readWaiters.push(resolve);
441
+ });
442
+ }
443
+
444
+ async waitForReadersWithTimeout(timeoutMs = 5000) {
445
+ if (this.activeReads === 0) return true;
446
+ let timedOut = false;
447
+ let resolved = false;
448
+ let waiterResolve;
449
+ const waiterPromise = new Promise((resolve) => {
450
+ waiterResolve = () => {
451
+ if (!resolved) {
452
+ resolved = true;
453
+ resolve();
454
+ }
455
+ };
456
+ this._readWaiters.push(waiterResolve);
457
+ });
458
+ await Promise.race([
459
+ waiterPromise,
460
+ new Promise((resolve) => {
461
+ setTimeout(() => {
462
+ if (!resolved) {
463
+ resolved = true;
464
+ timedOut = true;
465
+ // Remove waiter from array to prevent late invocation after timeout
466
+ const idx = this._readWaiters.indexOf(waiterResolve);
467
+ if (idx >= 0) this._readWaiters.splice(idx, 1);
468
+ resolve();
469
+ }
470
+ }, timeoutMs);
471
+ }),
472
+ ]);
473
+ if (timedOut) {
474
+ // Always warn (not just verbose) since proceeding with active readers is risky
475
+ console.warn(
476
+ `[Cache] Timed out waiting for ${this.activeReads} active reader(s); proceeding with save anyway. ` +
477
+ 'This may cause data inconsistency if readers access the store during write.'
478
+ );
479
+ }
480
+ return !timedOut;
481
+ }
482
+
483
+ // -------------------- Reset --------------------
484
+
485
+ /**
486
+ * Resets the cache state (clears vectors, hashes, and call graph).
487
+ * Used for forced reindexing.
488
+ */
489
+ async reset() {
490
+ this.vectorStore = [];
491
+ if (this.binaryStore) {
492
+ try {
493
+ await this.binaryStore.close();
494
+ } catch {
495
+ // ignore close errors
496
+ }
497
+ this.binaryStore = null;
498
+ }
499
+ if (this.sqliteStore) {
500
+ try {
501
+ this.sqliteStore.close();
502
+ } catch {
503
+ // ignore close errors
504
+ }
505
+ this.sqliteStore = null;
506
+ }
507
+ this.fileHashes.clear();
508
+ this.invalidateAnnIndex();
509
+ await this.clearCallGraphData({ removeFile: true });
510
+ this.initErrors = [];
511
+ }
512
+
513
+ // -------------------- Load --------------------
514
+
515
  /**
   * Loads the persisted cache from disk: validates metadata, hydrates the
   * configured vector store backend (json/binary/sqlite), filters chunks and
   * hashes against the current extension allow-list, and restores call-graph
   * data. Any validation failure or load error clears in-memory state and
   * leaves the cache empty (the caller then reindexes).
   * @param {{forceVectorLoadMode?: 'disk'|'memory'}} [options] - Overrides the
   *   configured vector load mode for this load only.
   */
  async load({ forceVectorLoadMode } = {}) {
    if (!this.config.enableCache) return;

    try {
      await fs.mkdir(this.config.cacheDirectory, { recursive: true });

      const cacheFile = path.join(this.config.cacheDirectory, 'embeddings.json');
      const hashFile = path.join(this.config.cacheDirectory, 'file-hashes.json');
      const metaFile = path.join(this.config.cacheDirectory, CACHE_META_FILE);

      // Worker-thread parsing kicks in above this file size.
      const workerThresholdBytes =
        Number.isInteger(this.config.jsonWorkerThresholdBytes) &&
        this.config.jsonWorkerThresholdBytes > 0
          ? this.config.jsonWorkerThresholdBytes
          : JSON_WORKER_THRESHOLD_BYTES;

      const useBinary = this.config.vectorStoreFormat === 'binary';
      const useSqlite = this.config.vectorStoreFormat === 'sqlite';

      const { vectorsPath, recordsPath, contentPath, filesPath } = BinaryVectorStore.getPaths(
        this.config.cacheDirectory
      );
      const pathExists = async (targetPath) => {
        try {
          await fs.access(targetPath);
          return true;
        } catch {
          return false;
        }
      };

      // In tests, read cache files eagerly to exercise worker paths.
      let cacheData = null;
      let hashData = null;
      let prefetched = false;
      if (IS_TEST_ENV) {
        prefetched = true;
        const cachePromise = useBinary || useSqlite
          ? Promise.resolve(null)
          : readJsonFile(cacheFile, { workerThresholdBytes });
        [cacheData, hashData] = await Promise.all([
          cachePromise,
          readJsonFile(hashFile, { workerThresholdBytes }),
        ]);
      }

      // Read meta first to avoid parsing huge cache files when invalid
      const metaData = await fs.readFile(metaFile, 'utf-8').catch(() => null);
      if (!metaData) {
        console.warn('[Cache] Missing cache metadata, ignoring cache');
        this.clearInMemoryState();
        return;
      }

      let meta;
      try {
        meta = JSON.parse(metaData);
      } catch {
        console.warn('[Cache] Invalid cache metadata, ignoring cache');
        this.clearInMemoryState();
        return;
      }

      // Compatibility gates: version, embedding model, embedding dimension.
      // Any mismatch means the stored vectors are unusable.
      if (meta?.version !== CACHE_META_VERSION) {
        console.warn(`[Cache] Cache version mismatch (${meta?.version}), ignoring cache`);
        this.clearInMemoryState();
        return;
      }

      if (meta?.embeddingModel !== this.config.embeddingModel) {
        console.warn(
          `[Cache] Embedding model changed, ignoring cache (${meta?.embeddingModel} -> ${this.config.embeddingModel})`
        );
        this.clearInMemoryState();
        return;
      }
      const expectedDimension = this.config.embeddingDimension ?? null;
      const metaDimension = meta?.embeddingDimension ?? null;
      if (metaDimension !== expectedDimension) {
        console.warn(
          `[Cache] Embedding dimension changed, ignoring cache (${metaDimension} -> ${expectedDimension})`
        );
        this.clearInMemoryState();
        return;
      }

      if (!prefetched) {
        [cacheData, hashData] = await Promise.all([
          useBinary || useSqlite ? Promise.resolve(null) : readJsonFile(cacheFile, { workerThresholdBytes }),
          readJsonFile(hashFile, { workerThresholdBytes }),
        ]);
      }

      this.cacheMeta = meta;

      // Detect format/file mismatches so the user gets an actionable warning
      // instead of a silently empty index.
      const [binaryFilesPresent, jsonCachePresent] = await Promise.all([
        (async () => {
          const [vectorsOk, recordsOk, contentOk, filesOk] = await Promise.all([
            pathExists(vectorsPath),
            pathExists(recordsPath),
            pathExists(contentPath),
            pathExists(filesPath),
          ]);
          return vectorsOk && recordsOk && contentOk && filesOk;
        })(),
        pathExists(cacheFile),
      ]);

      if (useBinary && !binaryFilesPresent) {
        if (jsonCachePresent) {
          console.warn(
            '[Cache] vectorStoreFormat=binary but binary cache files are missing; embeddings.json exists. If you switched formats, reindex or set vectorStoreFormat=json.'
          );
        } else {
          console.warn(
            '[Cache] vectorStoreFormat=binary but binary cache files are missing. Reindex to regenerate the cache.'
          );
        }
      } else if (!useBinary && !useSqlite && !jsonCachePresent) {
        if (binaryFilesPresent) {
          console.warn(
            '[Cache] vectorStoreFormat=json but binary cache files exist. If you switched formats, set vectorStoreFormat=binary or reindex.'
          );
        } else {
          console.warn(
            '[Cache] vectorStoreFormat=json but embeddings.json is missing. Reindex to regenerate the cache.'
          );
        }
      }

      // The per-call override (forceVectorLoadMode) beats the configured mode.
      const configuredVectorLoadMode =
        typeof this.config.vectorStoreLoadMode === 'string'
          ? this.config.vectorStoreLoadMode.toLowerCase()
          : 'memory';
      const effectiveVectorLoadMode =
        forceVectorLoadMode === 'disk' || forceVectorLoadMode === 'memory'
          ? forceVectorLoadMode
          : configuredVectorLoadMode;

      if (useBinary) {
        try {
          this.binaryStore = await BinaryVectorStore.load(this.config.cacheDirectory, {
            contentCacheEntries: this.config.contentCacheEntries,
            vectorCacheEntries: this.config.vectorCacheEntries,
            vectorLoadMode: effectiveVectorLoadMode,
          });
          // In disk mode, chunk views carry no vectors; they are fetched lazily.
          cacheData = await this.binaryStore.toChunkViews({
            includeContent: this.config.vectorStoreContentMode === 'inline',
            includeVector: effectiveVectorLoadMode !== 'disk',
          });
        } catch (err) {
          this.binaryStore = null;
          console.warn(`[Cache] Failed to load binary vector store: ${err.message}`);
        }
      }

      // SQLite store loading
      if (useSqlite) {
        try {
          this.sqliteStore = await SqliteVectorStore.load(this.config.cacheDirectory);
          if (this.sqliteStore) {
            cacheData = this.sqliteStore.toChunkViews({
              includeContent: this.config.vectorStoreContentMode === 'inline',
              includeVector: effectiveVectorLoadMode !== 'disk',
            });
          } else {
            // SQLite file missing, need reindex
            console.warn('[Cache] vectorStoreFormat=sqlite but vectors.sqlite is missing. Reindex to regenerate the cache.');
          }
        } catch (err) {
          this.sqliteStore = null;
          console.warn(`[Cache] Failed to load SQLite vector store: ${err.message}`);
        }
      }

      // Fallback: whatever format was configured, try the JSON cache last.
      if (!cacheData) {
        cacheData = await readJsonFile(cacheFile, { workerThresholdBytes });
      }

      const hasCacheData = Array.isArray(cacheData);
      const hasHashData = hashData && typeof hashData === 'object';

      if (hasCacheData) {
        // Drop chunks for files no longer covered by the configured
        // extensions/filenames (binary store views are already filtered).
        const allowedExtensions = new Set(
          (this.config.fileExtensions || []).map((ext) => `.${ext}`)
        );
        const allowedFileNames = new Set(this.config.fileNames || []);
        const applyExtensionFilter = !this.binaryStore;
        const shouldKeepFile = (filePath) => {
          const ext = path.extname(filePath);
          if (allowedExtensions.has(ext)) return true;
          return allowedFileNames.has(path.basename(filePath));
        };

        const rawHashes = hasHashData ? new Map(Object.entries(hashData)) : new Map();
        this.vectorStore = [];
        this.fileHashes.clear();

        // Single-pass filter + normalization
        for (const chunk of cacheData) {
          if (applyExtensionFilter) {
            if (!shouldKeepFile(chunk.file)) continue;
          }
          normalizeChunkVector(chunk);
          this.vectorStore.push(chunk);
        }
        const filteredCount = cacheData.length - this.vectorStore.length;
        if (filteredCount > 0 && this.config.verbose) {
          console.info(`[Cache] Filtered ${filteredCount} outdated cache entries`);
        }

        if (hasHashData) {
          // Only keep hashes for allowed extensions
          for (const [file, entry] of rawHashes) {
            if (!applyExtensionFilter || shouldKeepFile(file)) {
              const normalized = normalizeFileHashEntry(entry);
              if (normalized) {
                this.fileHashes.set(file, normalized);
              }
            }
          }
        } else {
          console.warn(
            '[Cache] Missing file-hashes.json; loaded embeddings but hashes were cleared'
          );
        }

        assignChunkIndices(this.vectorStore);

        if (this.config.verbose) {
          console.info(`[Cache] Loaded ${this.vectorStore.length} cached embeddings`);
        }

        // ANN index is lazily loaded/built on first query
        this.annDirty = false;
        this.annPersistDirty = false;
        this.annIndex = null;
        this.annMeta = null;
        this.annVectorCache = null;
      } else if (cacheData) {
        console.warn('[Cache] Cache data is not an array; ignoring cached embeddings');
      } else if (hasHashData) {
        console.warn('[Cache] Hashes exist without embeddings; ignoring file-hashes.json');
      }

      // Load call-graph data if it exists
      const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
      try {
        const callGraphData = await fs.readFile(callGraphFile, 'utf8');
        const parsed = JSON.parse(callGraphData);
        this.fileCallData = new Map(Object.entries(parsed));
        if (this.config.verbose) {
          console.info(`[Cache] Loaded call-graph data for ${this.fileCallData.size} files`);
        }
      } catch {
        // no cache yet, OK
      }
    } catch (error) {
      console.warn('[Cache] Failed to load cache:', error.message);
      this.clearInMemoryState();
    }
  }
777
+
778
+ // -------------------- Save (debounced + serialized) --------------------
779
+
780
  /**
   * Schedules a debounced, coalesced save. Multiple calls within the
   * debounce window collapse into a single write, and saves are serialized
   * through saveQueue so two performSave() runs can never overlap.
   * @returns {Promise<void>} Resolves when the scheduled save completes.
   */
  save() {
    if (!this.config.enableCache) return Promise.resolve();

    // Mark that state changed; the drain loop below re-checks this flag.
    this._saveRequested = true;

    // A timer is already pending: piggyback on its promise.
    if (this._saveTimer) return this._savePromise ?? Promise.resolve();

    const debounceMs = Number.isInteger(this.config.saveDebounceMs)
      ? this.config.saveDebounceMs
      : 250;

    this._savePromise = new Promise((resolve, reject) => {
      this._saveTimer = setTimeout(() => {
        this._saveTimer = null;

        // Chain onto saveQueue so saves are strictly serialized.
        this.saveQueue = this.saveQueue
          .then(async () => {
            // Drain loop: a save() that lands while performSave() is running
            // re-sets _saveRequested and triggers exactly one more pass.
            while (this._saveRequested) {
              this._saveRequested = false;
              await this.performSave();
            }
          })
          .then(resolve, reject)
          .finally(() => {
            this._savePromise = null;
          });
      }, debounceMs);
    });

    return this._savePromise;
  }
811
+
812
+ async performSave() {
813
+ // Block new reads from starting during save operation
814
+ this._saveInProgress = true;
815
+
816
+ // Wait for active readers before modifying state to prevent data corruption
817
+ if (this.activeReads > 0) {
818
+ const timeoutMs = this.config.saveReaderWaitTimeoutMs ?? DEFAULT_READER_WAIT_TIMEOUT_MS;
819
+ const allReadersFinished = await this.waitForReadersWithTimeout(timeoutMs);
820
+ if (!allReadersFinished && !this.config.forceSaveWithActiveReaders) {
821
+ console.warn('[Cache] Aborting save - active readers still present after timeout');
822
+ this._saveInProgress = false; // Reset flag on early return
823
+ return; // Abort instead of risking data corruption
824
+ }
825
+ }
826
+
827
+ this.isSaving = true;
828
+
829
+ try {
830
+ await fs.mkdir(this.config.cacheDirectory, { recursive: true });
831
+
832
+ const cacheFile = path.join(this.config.cacheDirectory, 'embeddings.json');
833
+ const hashFile = path.join(this.config.cacheDirectory, 'file-hashes.json');
834
+ const metaFile = path.join(this.config.cacheDirectory, CACHE_META_FILE);
835
+
836
+ // Snapshot to avoid race conditions during async write.
837
+ // Keep this shallow for binary/sqlite to prevent multi-GB vector materialization.
838
+ const snapshotStore = Array.isArray(this.vectorStore) ? [...this.vectorStore] : [];
839
+ const supportsBackendVectorResolve =
840
+ this.config.vectorStoreFormat === 'binary' || this.config.vectorStoreFormat === 'sqlite';
841
+ const hasMissingVectors = snapshotStore.some(
842
+ (chunk) => chunk && (chunk.vector === undefined || chunk.vector === null)
843
+ );
844
+ const useDiskVectors =
845
+ supportsBackendVectorResolve &&
846
+ (this.config.vectorStoreLoadMode === 'disk' || hasMissingVectors);
847
+ if (hasMissingVectors && !useDiskVectors) {
848
+ throw new Error(
849
+ 'Missing vector data for cache write and backend vector resolution is unavailable'
850
+ );
851
+ }
852
+
853
+ this.cacheMeta = {
854
+ version: CACHE_META_VERSION,
855
+ embeddingModel: this.config.embeddingModel,
856
+ embeddingDimension: this.config.embeddingDimension ?? null,
857
+ lastSaveTime: new Date().toISOString(),
858
+ filesIndexed: this.fileHashes.size,
859
+ chunksStored: snapshotStore.length,
860
+ workspace: this.config.searchDirectory || null,
861
+ };
862
+ if (Number.isFinite(this.lastIndexDurationMs) && this.lastIndexDurationMs >= 0) {
863
+ this.cacheMeta.indexDurationMs = Math.round(this.lastIndexDurationMs);
864
+ }
865
+ if (this.lastIndexStats && typeof this.lastIndexStats === 'object') {
866
+ Object.assign(this.cacheMeta, this.lastIndexStats);
867
+ }
868
+
869
+ const total = snapshotStore.length;
870
+ if (this.config.vectorStoreFormat === 'binary') {
871
+ this.binaryStore = await BinaryVectorStore.write(
872
+ this.config.cacheDirectory,
873
+ snapshotStore,
874
+ {
875
+ contentCacheEntries: this.config.contentCacheEntries,
876
+ vectorCacheEntries: this.config.vectorCacheEntries,
877
+ vectorLoadMode: useDiskVectors ? 'disk' : this.config.vectorStoreLoadMode,
878
+ getContent: (chunk, index) => this.getChunkContent(chunk, index),
879
+ getVector: useDiskVectors ? (chunk, index) => this.getChunkVector(chunk, index) : null,
880
+ preRename: async () => {
881
+ if (this.activeReads > 0) {
882
+ await this.waitForReadersWithTimeout(
883
+ Number.isInteger(this.config.saveReaderWaitTimeoutMs)
884
+ ? this.config.saveReaderWaitTimeoutMs
885
+ : 5000
886
+ );
887
+ }
888
+ if (this.binaryStore) {
889
+ await this.binaryStore.close();
890
+ this.binaryStore = null;
891
+ }
892
+ },
893
+ }
894
+ );
895
+ if (this.binaryStore) {
896
+ this.cacheMeta.chunksStored = this.binaryStore.length;
897
+ }
898
+ } else if (this.config.vectorStoreFormat === 'sqlite') {
899
+ // SQLite store save
900
+ if (this.sqliteStore) {
901
+ try {
902
+ this.sqliteStore.close();
903
+ } catch {
904
+ // ignore close errors
905
+ }
906
+ this.sqliteStore = null;
907
+ }
908
+ this.sqliteStore = await SqliteVectorStore.write(
909
+ this.config.cacheDirectory,
910
+ snapshotStore,
911
+ {
912
+ getContent: (chunk, index) => this.getChunkContent(chunk, index),
913
+ getVector: useDiskVectors ? (chunk, index) => this.getChunkVector(chunk, index) : null,
914
+ preRename: async () => {
915
+ if (this.activeReads > 0) {
916
+ await this.waitForReadersWithTimeout(
917
+ Number.isInteger(this.config.saveReaderWaitTimeoutMs)
918
+ ? this.config.saveReaderWaitTimeoutMs
919
+ : 5000
920
+ );
921
+ }
922
+ },
923
+ }
924
+ );
925
+ if (this.sqliteStore) {
926
+ this.cacheMeta.chunksStored = this.sqliteStore.length();
927
+ }
928
+ } else {
929
+ const vectorWriter = new StreamingJsonWriter(cacheFile, {
930
+ highWaterMark: this.config.cacheWriteHighWaterMark ?? 256 * 1024,
931
+ floatDigits: this.config.cacheVectorFloatDigits ?? 6,
932
+ flushChars: this.config.cacheVectorFlushChars ?? 256 * 1024,
933
+ indent: '', // set to " " if you prefer pretty formatting
934
+ assumeFinite: this.config.cacheVectorAssumeFinite,
935
+ checkFinite: this.config.cacheVectorCheckFinite,
936
+ noMutation: this.config.cacheVectorNoMutation ?? false,
937
+ joinThreshold: this.config.cacheVectorJoinThreshold ?? 8192,
938
+ joinChunkSize: this.config.cacheVectorJoinChunkSize ?? 2048,
939
+ });
940
+
941
+ await vectorWriter.writeStart();
942
+
943
+ // Optional responsiveness yield (only for huge saves)
944
+ const yieldEvery = total >= 50_000 ? 5000 : 0;
945
+
946
+ try {
947
+ for (let i = 0; i < total; i++) {
948
+ const pending = vectorWriter.writeItem(snapshotStore[i]);
949
+ if (pending) await pending;
950
+ if (yieldEvery && i > 0 && i % yieldEvery === 0) await yieldToLoop();
951
+ }
952
+ await vectorWriter.writeEnd();
953
+ } catch (e) {
954
+ vectorWriter.abort(e);
955
+ throw e;
956
+ }
957
+ }
958
+
959
+ const hashEntries = {};
960
+ for (const [file, entry] of this.fileHashes) {
961
+ const serialized = serializeFileHashEntry(entry);
962
+ if (serialized) {
963
+ hashEntries[file] = serialized;
964
+ }
965
+ }
966
+
967
+ await Promise.all([
968
+ fs.writeFile(hashFile, JSON.stringify(hashEntries, null, 2)),
969
+ fs.writeFile(metaFile, JSON.stringify(this.cacheMeta, null, 2)),
970
+ ]);
971
+
972
+ // Save call-graph data (or remove stale cache)
973
+ const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
974
+ if (this.fileCallData.size > 0) {
975
+ await fs.writeFile(
976
+ callGraphFile,
977
+ JSON.stringify(Object.fromEntries(this.fileCallData), null, 2)
978
+ );
979
+ } else {
980
+ await fs.rm(callGraphFile, { force: true });
981
+ }
982
+
983
+ // Persist ANN index if it exists and changed in memory
984
+ // Use mutex to prevent concurrent writes (index could be modified during save)
985
+ if (
986
+ this.config.annIndexCache !== false &&
987
+ this.annPersistDirty &&
988
+ !this.annDirty &&
989
+ !this._annWriting &&
990
+ this.annIndex &&
991
+ this.annMeta
992
+ ) {
993
+ this._annWriting = true;
994
+ try {
995
+ const { indexFile, metaFile: annMetaFile } = this.getAnnIndexPaths();
996
+ this.annIndex.writeIndexSync(indexFile);
997
+ await fs.writeFile(annMetaFile, JSON.stringify(this.annMeta, null, 2));
998
+ this.annPersistDirty = false;
999
+ if (this.config.verbose) {
1000
+ console.info(`[ANN] Persisted updated ANN index (${this.annMeta.count} vectors)`);
1001
+ }
1002
+ } catch (error) {
1003
+ console.warn(`[ANN] Failed to persist ANN index: ${error.message}`);
1004
+ } finally {
1005
+ this._annWriting = false;
1006
+ }
1007
+ }
1008
+ } catch (error) {
1009
+ console.warn('[Cache] Failed to save cache:', error.message);
1010
+ // Attempt to recover binary store if it was closed during failed save
1011
+ if (
1012
+ this.config.vectorStoreFormat === 'binary' &&
1013
+ this.binaryStore &&
1014
+ !this.binaryStore.vectorsBuffer
1015
+ ) {
1016
+ try {
1017
+ console.info('[Cache] Attempting to recover binary store after failed save...');
1018
+ this.binaryStore = await BinaryVectorStore.load(this.config.cacheDirectory, {
1019
+ contentCacheEntries: this.config.contentCacheEntries,
1020
+ });
1021
+ console.info('[Cache] Binary store recovered.');
1022
+ } catch (recoverErr) {
1023
+ console.warn(`[Cache] Failed to recover binary store: ${recoverErr.message}`);
1024
+ this.binaryStore = null; // Ensure it's null if unusable
1025
+ }
1026
+ }
1027
+ // Attempt to recover SQLite store if closed during failed save
1028
+ if (
1029
+ this.config.vectorStoreFormat === 'sqlite' &&
1030
+ !this.sqliteStore
1031
+ ) {
1032
+ try {
1033
+ console.info('[Cache] Attempting to recover SQLite store after failed save...');
1034
+ this.sqliteStore = await SqliteVectorStore.load(this.config.cacheDirectory);
1035
+ if (this.sqliteStore) {
1036
+ console.info('[Cache] SQLite store recovered.');
1037
+ }
1038
+ } catch (recoverErr) {
1039
+ console.warn(`[Cache] Failed to recover SQLite store: ${recoverErr.message}`);
1040
+ this.sqliteStore = null;
1041
+ }
1042
+ }
1043
+ } finally {
1044
+ this.isSaving = false;
1045
+ this._saveInProgress = false; // Allow reads to resume
1046
+ }
1047
+ }
1048
+
1049
+ // -------------------- Vector Store API --------------------
1050
+
1051
+ getVectorStore() {
1052
+ return Array.isArray(this.vectorStore) ? this.vectorStore : [];
1053
+ }
1054
+
1055
  /**
   * Replace the entire in-memory vector store.
   * Normalizes each chunk's vector, reassigns `_index` labels, invalidates
   * the ANN index, and finally closes any previously active binary/SQLite
   * backend. The old backends are captured first and closed last so the
   * swap is complete before any backend teardown happens.
   * @param {Array<object>} store - New chunk array (non-array values are
   *   stored as-is and treated as empty by readers)
   */
  async setVectorStore(store) {
    const previousBinaryStore = this.binaryStore;
    const previousSqliteStore = this.sqliteStore;
    this.vectorStore = store;
    this.binaryStore = null;
    this.sqliteStore = null;
    if (Array.isArray(this.vectorStore)) {
      for (const chunk of this.vectorStore) normalizeChunkVector(chunk);
      assignChunkIndices(this.vectorStore);
    }
    // Labels changed wholesale, so any existing ANN index is stale.
    this.invalidateAnnIndex();
    if (previousBinaryStore) {
      try {
        await previousBinaryStore.close();
      } catch {
        // ignore close errors
      }
    }
    if (previousSqliteStore) {
      try {
        previousSqliteStore.close();
      } catch {
        // ignore close errors
      }
    }
  }
1081
+
1082
+ setLastIndexDuration(durationMs) {
1083
+ if (Number.isFinite(durationMs) && durationMs >= 0) {
1084
+ this.lastIndexDurationMs = durationMs;
1085
+ }
1086
+ }
1087
+
1088
+ setLastIndexStats(stats) {
1089
+ if (stats && typeof stats === 'object') {
1090
+ this.lastIndexStats = { ...stats };
1091
+ }
1092
+ }
1093
+
1094
+ getFileHash(file) {
1095
+ const entry = this.fileHashes.get(file);
1096
+ if (typeof entry === 'string') return entry;
1097
+ return entry?.hash;
1098
+ }
1099
+
1100
+ getFileHashKeys() {
1101
+ return Array.from(this.fileHashes.keys());
1102
+ }
1103
+
1104
+ getFileHashCount() {
1105
+ return this.fileHashes.size;
1106
+ }
1107
+
1108
+ clearFileHashes() {
1109
+ this.fileHashes.clear();
1110
+ }
1111
+
1112
+ setFileHashes(entries) {
1113
+ this.fileHashes.clear();
1114
+ if (!entries || typeof entries !== 'object') return;
1115
+ const iterator =
1116
+ entries instanceof Map
1117
+ ? entries.entries()
1118
+ : Object.entries(entries);
1119
+ if (!iterator) return;
1120
+ for (const [file, entry] of iterator) {
1121
+ const normalized = normalizeFileHashEntry(entry);
1122
+ if (normalized) {
1123
+ this.fileHashes.set(file, normalized);
1124
+ }
1125
+ }
1126
+ }
1127
+
1128
+ setFileHash(file, hash, meta = null) {
1129
+ const entry = { hash };
1130
+ if (meta && typeof meta === 'object') {
1131
+ if (Number.isFinite(meta.mtimeMs)) entry.mtimeMs = meta.mtimeMs;
1132
+ if (Number.isFinite(meta.size)) entry.size = meta.size;
1133
+ }
1134
+ this.fileHashes.set(file, entry);
1135
+ }
1136
+
1137
+ getFileMeta(file) {
1138
+ const entry = this.fileHashes.get(file);
1139
+ if (!entry) return null;
1140
+ if (typeof entry === 'string') return { hash: entry };
1141
+ return entry;
1142
+ }
1143
+
1144
  /**
   * Resolve the embedding vector for a chunk.
   * Accepts either a numeric store index or a chunk object. Resolution
   * order: the in-memory `vector` property, then the binary store (using
   * `_binaryIndex` when present, otherwise the positional index), then the
   * SQLite store (`_sqliteIndex`, then `index`, then the positional index).
   * @param {number|object} chunk - Store index or chunk object
   * @param {number|null} [index] - Positional hint used when `chunk` is an object
   * @returns {Float32Array|number[]|null} The vector, or null when unavailable
   */
  getChunkVector(chunk, index = null) {
    // Numeric form: look up the array entry first (it may carry backend hints).
    if (typeof chunk === 'number') {
      const store = Array.isArray(this.vectorStore) ? this.vectorStore : null;
      const entry = store ? store[chunk] : null;
      if (entry?.vector) return entry.vector;
      if (this.binaryStore) {
        const resolved = Number.isInteger(entry?._binaryIndex) ? entry._binaryIndex : chunk;
        return this.binaryStore.getVector(resolved);
      }
      if (this.sqliteStore) {
        const resolved = Number.isInteger(entry?._sqliteIndex) ? entry._sqliteIndex : chunk;
        return this.sqliteStore.getVector(resolved);
      }
      return null;
    }

    // Object form: in-memory vector wins outright.
    if (chunk?.vector) return chunk.vector;
    const resolved = Number.isInteger(index) ? index : chunk?._index;
    if (this.binaryStore && Number.isInteger(chunk?._binaryIndex)) {
      return this.binaryStore.getVector(chunk._binaryIndex);
    }
    // Positional binary lookup is only trusted when there is no array store
    // (array indices need not match binary record order).
    if (this.binaryStore && !Array.isArray(this.vectorStore) && Number.isInteger(resolved)) {
      return this.binaryStore.getVector(resolved);
    }
    if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk?._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk?.index)
          ? chunk.index
          : resolved;
      if (Number.isInteger(sqliteIndex)) {
        return this.sqliteStore.getVector(sqliteIndex);
      }
    }
    return null;
  }
1180
+
1181
  /**
   * Resolve the text content for a chunk.
   * Mirrors getChunkVector's resolution order: in-memory `content`, then
   * the binary store, then the SQLite store. Always resolves to a string
   * (empty string when the content cannot be found).
   * @param {number|object} chunk - Store index or chunk object
   * @param {number|null} [index] - Positional hint used when `chunk` is an object
   * @returns {Promise<string>}
   */
  async getChunkContent(chunk, index = null) {
    // Numeric form: delegate to the object form when an array entry exists.
    if (typeof chunk === 'number') {
      const store = Array.isArray(this.vectorStore) ? this.vectorStore : null;
      const entry = store ? store[chunk] : null;
      if (entry) return await this.getChunkContent(entry, chunk);
      if (!store && this.binaryStore) {
        const content = await this.binaryStore.getContent(chunk);
        return content ?? ''; // Ensure consistent empty string return
      }
      if (!store && this.sqliteStore) {
        return this.sqliteStore.getContent(chunk) ?? '';
      }
      return '';
    }
    // In-memory content wins; note empty string is valid content and returned as-is.
    if (chunk?.content !== undefined && chunk?.content !== null) {
      return chunk.content;
    }
    if (this.binaryStore && Number.isInteger(chunk?._binaryIndex)) {
      const content = await this.binaryStore.getContent(chunk._binaryIndex);
      return content ?? ''; // Ensure consistent empty string return
    }
    const resolved = Number.isInteger(index) ? index : chunk?._index;
    // Positional binary lookup only when no array store is present.
    if (this.binaryStore && !Array.isArray(this.vectorStore) && Number.isInteger(resolved)) {
      const content = await this.binaryStore.getContent(resolved);
      return content ?? ''; // Ensure consistent empty string return
    }
    if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk?._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk?.index)
          ? chunk.index
          : resolved;
      if (Number.isInteger(sqliteIndex)) {
        return this.sqliteStore.getContent(sqliteIndex) ?? '';
      }
    }
    return '';
  }
1219
+
1220
+ deleteFileHash(file) {
1221
+ this.fileHashes.delete(file);
1222
+ }
1223
+
1224
+ /**
1225
+ * Remove all chunks for a given file from the vector store.
1226
+ * Note: This is async to support future backend-specific cleanup.
1227
+ * For binary/SQLite stores, actual removal happens on next full save.
1228
+ * @param {string} file - Absolute path of file to remove
1229
+ */
1230
+ async removeFileFromStore(file) {
1231
+ if (!Array.isArray(this.vectorStore)) return;
1232
+ // In-place compaction to avoid allocating a new large array
1233
+ let w = 0;
1234
+ for (let r = 0; r < this.vectorStore.length; r++) {
1235
+ const chunk = this.vectorStore[r];
1236
+ if (chunk.file !== file) {
1237
+ chunk._index = w;
1238
+ this.vectorStore[w++] = chunk;
1239
+ }
1240
+ }
1241
+ this.vectorStore.length = w;
1242
+
1243
+ // Removing shifts labels => rebuild ANN
1244
+ this.invalidateAnnIndex();
1245
+ this.removeFileCallData(file);
1246
+ // Also remove file hash to prevent orphaned entries
1247
+ this.fileHashes.delete(file);
1248
+ }
1249
+
1250
  /**
   * Append a chunk to the in-memory store, assigning it the next label.
   * Fast path: when the ANN index is clean, its count matches the new
   * label, and it has spare capacity, the vector is appended to the index
   * incrementally; otherwise the ANN index is invalidated and rebuilt
   * lazily on the next ANN query.
   * @param {object} chunk - Chunk with a `vector`; `_index` is assigned here
   */
  addToStore(chunk) {
    normalizeChunkVector(chunk);

    if (!Array.isArray(this.vectorStore)) {
      this.vectorStore = [];
    }

    // Label == array position; HNSW labels mirror these indices.
    const label = this.vectorStore.length;
    chunk._index = label;
    this.vectorStore.push(chunk);
    // Keep the ANN vector cache aligned, but only if it is exactly in sync.
    if (Array.isArray(this.annVectorCache) && this.annVectorCache.length === label) {
      this.annVectorCache.push(chunk.vector);
    }

    // Best-effort incremental ANN append (fast path)
    if (
      this.annIndex &&
      !this.annDirty &&
      this.annMeta &&
      typeof this.annIndex.addPoint === 'function' &&
      this.annMeta.count === label &&
      this.annMeta.maxElements > this.annMeta.count
    ) {
      try {
        this.annIndex.addPoint(chunk.vector, label);
        this.annMeta.count += 1;
        this.annPersistDirty = true;
        return;
      } catch {
        // fall through to full invalidation below
      }
    }

    this.invalidateAnnIndex();
  }
1285
+
1286
+ invalidateAnnIndex() {
1287
+ this.annIndex = null;
1288
+ this.annMeta = null;
1289
+ this.annDirty = true;
1290
+ this.annPersistDirty = false;
1291
+ this.annVectorCache = null;
1292
+ }
1293
+
1294
  /**
   * Fetch (and memoize) the vector for a store index for ANN building.
   * Resolution order: chunk's in-memory vector, binary store, SQLite store.
   * Results are cached in `annVectorCache`, and — unless the load mode is
   * 'disk' — also written back onto the chunk to avoid repeat backend reads.
   * @param {number} index - Position in the array-backed vector store
   * @returns {Float32Array|null} The vector, or null when unresolvable
   */
  getAnnVector(index) {
    if (!Array.isArray(this.vectorStore)) return null;
    const chunk = this.vectorStore[index];
    if (!chunk) return null;

    // (Re)allocate the cache whenever its length no longer matches the store.
    if (
      !Array.isArray(this.annVectorCache) ||
      this.annVectorCache.length !== this.vectorStore.length
    ) {
      this.annVectorCache = new Array(this.vectorStore.length);
    }

    const cached = this.annVectorCache[index];
    if (cached) return cached;

    let vec = null;
    if (chunk.vector) {
      vec = ensureFloat32(chunk.vector);
    } else if (this.binaryStore && Number.isInteger(chunk._binaryIndex)) {
      vec = this.binaryStore.getVector(chunk._binaryIndex);
    } else if (this.sqliteStore) {
      const sqliteIndex = Number.isInteger(chunk._sqliteIndex)
        ? chunk._sqliteIndex
        : Number.isInteger(chunk.index)
          ? chunk.index
          : index;
      if (Number.isInteger(sqliteIndex)) {
        vec = this.sqliteStore.getVector(sqliteIndex);
      }
    }

    if (!vec) return null;

    // In 'disk' mode, avoid pinning vectors in memory on the chunks.
    if (this.config.vectorStoreLoadMode !== 'disk') {
      chunk.vector = vec;
    }
    this.annVectorCache[index] = vec;
    return vec;
  }
1333
+
1334
+ getAnnIndexPaths() {
1335
+ return {
1336
+ indexFile: path.join(this.config.cacheDirectory, ANN_INDEX_FILE),
1337
+ metaFile: path.join(this.config.cacheDirectory, ANN_META_FILE),
1338
+ };
1339
+ }
1340
+
1341
+ // -------------------- ANN --------------------
1342
+
1343
  /**
   * Ensure ANN (Approximate Nearest Neighbor) index is built and ready.
   * Loads from disk cache if available and valid, otherwise builds a new index.
   *
   * @returns {Promise<HierarchicalNSW|null>} The HNSW index, or null if:
   *   - ANN is disabled in config
   *   - vectorStore is not an array
   *   - vectorStore size is below annMinChunks threshold
   *   - hnswlib-node is not available
   *   - Vector dimension mismatch detected
   * @note This method is safe to call multiple times; concurrent calls share the same promise.
   */
  async ensureAnnIndex() {
    if (!this.config.annEnabled) return null;
    if (!Array.isArray(this.vectorStore)) return null;
    if (this.vectorStore.length < (this.config.annMinChunks ?? 5000)) return null;
    // Fresh index already in memory: reuse it.
    if (this.annIndex && !this.annDirty) return this.annIndex;
    // A load/build is already in flight: share its promise.
    if (this.annLoading) return this.annLoading;

    this.annLoading = (async () => {
      try {
        const HierarchicalNSW = await loadHnswlib();
        if (!HierarchicalNSW) {
          if (hnswlibLoadError) {
            this.addInitError('loadHnswlib', hnswlibLoadError);
          }
          return null;
        }

        // Dimension from the first in-memory vector, else from a backend.
        const dim =
          this.vectorStore[0]?.vector?.length ||
          this.binaryStore?.dim ||
          this.sqliteStore?.dim;
        if (!dim) return null;

        // Validate dimension consistency before building index
        // Use stratified sampling for better coverage across entire store
        let dimensionMismatch = false;
        const sampleSize = Math.min(ANN_DIMENSION_SAMPLE_SIZE, this.vectorStore.length);
        const step = Math.max(1, Math.floor(this.vectorStore.length / sampleSize));
        for (let i = step; i < this.vectorStore.length; i += step) {
          const v = this.vectorStore[i]?.vector;
          if (v && v.length !== dim) {
            dimensionMismatch = true;
            console.warn(
              `[ANN] Dimension mismatch at index ${i}: expected ${dim}, got ${v.length}. ` +
                'This may indicate a config change mid-index. Consider full reindex.'
            );
            break;
          }
        }

        if (dimensionMismatch) {
          this.addInitError('ensureAnnIndex', `Vector dimension inconsistency detected. Expected ${dim}. Full reindex required.`);
          return null; // Skip ANN build - fallback to linear search
        }

        // Try the on-disk cache first, unless the index is known-dirty.
        if (!this.annDirty && this.config.annIndexCache !== false) {
          const loaded = await this.loadAnnIndexFromDisk(HierarchicalNSW, dim);
          if (loaded) return this.annIndex;
        }

        return await this.buildAnnIndex(HierarchicalNSW, dim);
      } finally {
        // Always release the shared promise slot, even on failure.
        this.annLoading = null;
      }
    })();

    return this.annLoading;
  }
1413
+
1414
  /**
   * Attempt to load a previously persisted ANN index from disk.
   * Validates the metadata (format version, embedding model, dimension,
   * count, HNSW build parameters, capacity) against the current config and
   * store before trusting the index file; any mismatch triggers a rebuild.
   * @param {Function} HierarchicalNSW - hnswlib-node index constructor
   * @param {number} dim - Expected vector dimension
   * @returns {Promise<boolean>} true when the index was loaded and installed
   */
  async loadAnnIndexFromDisk(HierarchicalNSW, dim) {
    const { indexFile, metaFile } = this.getAnnIndexPaths();
    const metaData = await fs.readFile(metaFile, 'utf-8').catch(() => null);
    if (!metaData) return false;

    let meta;
    try {
      meta = JSON.parse(metaData);
    } catch {
      console.warn('[ANN] Invalid ANN metadata, rebuilding');
      return false;
    }

    if (meta?.version !== ANN_META_VERSION) {
      console.warn(`[ANN] ANN index version mismatch (${meta?.version}), rebuilding`);
      return false;
    }

    if (meta?.embeddingModel !== this.config.embeddingModel) {
      console.warn('[ANN] Embedding model changed for ANN index, rebuilding');
      return false;
    }

    if (meta?.dim !== dim || meta?.count !== this.vectorStore.length) {
      console.warn('[ANN] ANN index size mismatch, rebuilding');
      return false;
    }

    if (
      meta?.metric !== this.config.annMetric ||
      meta?.m !== this.config.annM ||
      meta?.efConstruction !== this.config.annEfConstruction
    ) {
      console.warn('[ANN] ANN index config changed, rebuilding');
      return false;
    }

    // Older metadata may lack maxElements; fall back to count.
    let maxElements = meta?.maxElements;
    if (!Number.isInteger(maxElements)) {
      maxElements = meta.count;
    } else if (maxElements < meta.count) {
      console.warn('[ANN] ANN capacity invalid, rebuilding');
      return false;
    }

    const index = new HierarchicalNSW(meta.metric, dim);
    const loaded = readHnswIndex(index, indexFile, maxElements);
    if (!loaded) {
      console.warn('[ANN] Failed to load ANN index file, rebuilding');
      return false;
    }

    if (typeof index.setEf === 'function') {
      index.setEf(this.config.annEfSearch);
    }

    // Install the loaded index; nothing new to persist.
    this.annIndex = index;
    this.annMeta = { ...meta, maxElements };
    this.annDirty = false;
    this.annPersistDirty = false;

    if (this.config.verbose) {
      console.info(`[ANN] Loaded ANN index (${meta.count} vectors, cap=${maxElements})`);
    }
    return true;
  }
1480
+
1481
  /**
   * Build a fresh HNSW index from the in-memory vector store.
   * Yields to the event loop periodically during insertion, then installs
   * the index and (when caching is enabled) persists it to disk. On any
   * failure the ANN state is reset to "dirty" so linear search is used.
   * @param {Function} HierarchicalNSW - hnswlib-node index constructor
   * @param {number} dim - Vector dimension
   * @returns {Promise<HierarchicalNSW|null>} The built index, or null on failure
   */
  async buildAnnIndex(HierarchicalNSW, dim) {
    if (!Array.isArray(this.vectorStore)) return null;
    const total = this.vectorStore.length;
    if (total === 0) return null;

    try {
      const index = new HierarchicalNSW(this.config.annMetric, dim);

      // Over-provision capacity so later incremental appends can succeed.
      const maxElements = computeAnnCapacity(total, this.config);
      initHnswIndex(index, maxElements, this.config.annM, this.config.annEfConstruction);

      const yieldEvery = Number.isInteger(this.config.annBuildYieldEvery)
        ? this.config.annBuildYieldEvery
        : 1000;

      for (let i = 0; i < total; i++) {
        const vector = this.getAnnVector(i);
        if (!vector) throw new Error(`Missing vector for ANN index at position ${i}`);
        index.addPoint(vector, i);

        // Keep the event loop responsive during long builds.
        if (yieldEvery > 0 && i > 0 && i % yieldEvery === 0) {
          await yieldToLoop();
        }
      }

      if (typeof index.setEf === 'function') {
        index.setEf(this.config.annEfSearch);
      }

      this.annIndex = index;
      this.annMeta = {
        version: ANN_META_VERSION,
        embeddingModel: this.config.embeddingModel,
        metric: this.config.annMetric,
        dim,
        count: total,
        maxElements,
        m: this.config.annM,
        efConstruction: this.config.annEfConstruction,
        efSearch: this.config.annEfSearch,
      };
      this.annDirty = false;
      // Freshly built => needs persisting unless we manage to save it below.
      this.annPersistDirty = true;

      if (this.config.annIndexCache !== false) {
        try {
          await fs.mkdir(this.config.cacheDirectory, { recursive: true });
          const { indexFile, metaFile } = this.getAnnIndexPaths();
          index.writeIndexSync(indexFile);
          await fs.writeFile(metaFile, JSON.stringify(this.annMeta, null, 2));
          this.annPersistDirty = false;
          if (this.config.verbose) {
            console.info(`[ANN] Saved ANN index (${total} vectors, cap=${maxElements})`);
          }
        } catch (error) {
          // Persisting is best-effort; the in-memory index remains usable.
          console.warn(`[ANN] Failed to save ANN index: ${error.message}`);
        }
      }

      return index;
    } catch (error) {
      console.warn(`[ANN] Failed to build ANN index: ${error.message}`);
      this.addInitError('buildAnnIndex', error);
      this.annIndex = null;
      this.annMeta = null;
      this.annDirty = true;
      this.annPersistDirty = false;
      return null;
    }
  }
1551
+
1552
+ /**
1553
+ * Query the ANN index for k nearest neighbors.
1554
+ * Falls back gracefully to empty results if ANN is unavailable.
1555
+ *
1556
+ * @param {Float32Array|number[]} queryVector - Normalized query embedding
1557
+ * @param {number} k - Number of neighbors to return
1558
+ * @returns {Promise<number[]>} Array of chunk indices sorted by similarity (may be empty)
1559
+ * @throws Never throws - returns empty array on all error conditions
1560
+ * @note Automatically invalidates corrupted index and falls back to linear search on next query
1561
+ */
1562
+ async queryAnn(queryVector, k) {
1563
+ if (!Array.isArray(this.vectorStore) || this.vectorStore.length === 0) return [];
1564
+ const index = await this.ensureAnnIndex();
1565
+ if (!index) return [];
1566
+
1567
+ const qVec = queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector);
1568
+
1569
+ // Wrap searchKnn in try-catch to handle corrupted index or dimension mismatches
1570
+ let results;
1571
+ try {
1572
+ results = index.searchKnn(qVec, k);
1573
+ } catch (err) {
1574
+ console.warn(`[ANN] searchKnn failed: ${err.message}. Falling back to linear search.`);
1575
+ this.addInitError('queryAnn', err);
1576
+ // Invalidate to trigger rebuild on next query
1577
+ this.invalidateAnnIndex();
1578
+ return [];
1579
+ }
1580
+
1581
+ const labels = normalizeLabels(results);
1582
+
1583
+ if (labels.length === 0) return [];
1584
+
1585
+ const filtered = labels.filter(
1586
+ (label) => Number.isInteger(label) && label >= 0 && label < this.vectorStore.length
1587
+ );
1588
+
1589
+ return filtered;
1590
+ }
1591
+
1592
  /**
   * Delete the on-disk cache directory and reset all in-memory state
   * (vector store, backends, file hashes, ANN index, call graph).
   * No-op when caching is disabled. Rethrows on failure after logging.
   * @throws When the cache directory cannot be removed
   */
  async clear() {
    if (!this.config.enableCache) return;

    try {
      await fs.rm(this.config.cacheDirectory, { recursive: true, force: true });
      this.vectorStore = [];
      // Close backends after their files are gone; close errors are ignored.
      if (this.binaryStore) {
        try {
          await this.binaryStore.close();
        } catch {
          // ignore close errors
        }
      }
      this.binaryStore = null;
      if (this.sqliteStore) {
        try {
          this.sqliteStore.close();
        } catch {
          // ignore close errors
        }
      }
      this.sqliteStore = null;
      this.fileHashes = new Map();
      this.invalidateAnnIndex();
      await this.clearCallGraphData();
      if (this.config.verbose) {
        console.info(`[Cache] Cache cleared successfully: ${this.config.cacheDirectory}`);
      }
    } catch (error) {
      console.error('[Cache] Failed to clear cache:', error.message);
      throw error;
    }
  }
1625
+
1626
+ /**
1627
+ * Adjust efSearch at runtime for speed/accuracy tradeoff.
1628
+ * Higher values = more accurate but slower.
1629
+ * @param {number} efSearch - New efSearch value (typically 16-512)
1630
+ * @returns {object} Result with success status and current config
1631
+ */
1632
+ setEfSearch(efSearch) {
1633
+ if (typeof efSearch !== 'number' || efSearch < 1 || efSearch > 1000) {
1634
+ return {
1635
+ success: false,
1636
+ error: 'efSearch must be a number between 1 and 1000',
1637
+ };
1638
+ }
1639
+
1640
+ this.config.annEfSearch = efSearch;
1641
+
1642
+ if (this.annIndex && typeof this.annIndex.setEf === 'function') {
1643
+ this.annIndex.setEf(efSearch);
1644
+ if (this.annMeta) this.annMeta.efSearch = efSearch;
1645
+ this.annPersistDirty = true;
1646
+ if (this.config.verbose) {
1647
+ console.info(`[ANN] efSearch updated to ${efSearch} (applied to active index)`);
1648
+ }
1649
+ return { success: true, applied: true, efSearch };
1650
+ }
1651
+
1652
+ if (this.config.verbose) {
1653
+ console.info(`[ANN] efSearch updated to ${efSearch} (will apply on next index build)`);
1654
+ }
1655
+ return { success: true, applied: false, efSearch };
1656
+ }
1657
+
1658
+ /**
1659
+ * Get current ANN index statistics for diagnostics.
1660
+ * @returns {object} ANN stats including index state, config, and vector count
1661
+ */
1662
+ getAnnStats() {
1663
+ return {
1664
+ enabled: this.config.annEnabled ?? false,
1665
+ indexLoaded: this.annIndex !== null,
1666
+ dirty: this.annDirty,
1667
+ vectorCount: Array.isArray(this.vectorStore) ? this.vectorStore.length : 0,
1668
+ minChunksForAnn: this.config.annMinChunks ?? 5000,
1669
+ config: this.annMeta
1670
+ ? {
1671
+ metric: this.annMeta.metric,
1672
+ dim: this.annMeta.dim,
1673
+ count: this.annMeta.count,
1674
+ m: this.annMeta.m,
1675
+ efConstruction: this.annMeta.efConstruction,
1676
+ efSearch: this.config.annEfSearch,
1677
+ }
1678
+ : null,
1679
+ };
1680
+ }
1681
+
1682
+ // -------------------- Call Graph --------------------
1683
+
1684
+ async clearCallGraphData({ removeFile = false } = {}) {
1685
+ this.fileCallData.clear();
1686
+ this.callGraph = null;
1687
+
1688
+ if (removeFile && this.config.enableCache) {
1689
+ const callGraphFile = path.join(this.config.cacheDirectory, CALL_GRAPH_FILE);
1690
+ try {
1691
+ await fs.rm(callGraphFile, { force: true });
1692
+ } catch (error) {
1693
+ if (this.config.verbose) {
1694
+ console.warn(`[Cache] Failed to remove call-graph cache: ${error.message}`);
1695
+ }
1696
+ }
1697
+ }
1698
+ }
1699
+
1700
+ pruneCallGraphData(validFiles) {
1701
+ if (!validFiles || this.fileCallData.size === 0) return 0;
1702
+
1703
+ let pruned = 0;
1704
+ for (const file of Array.from(this.fileCallData.keys())) {
1705
+ if (!validFiles.has(file)) {
1706
+ this.fileCallData.delete(file);
1707
+ pruned++;
1708
+ }
1709
+ }
1710
+
1711
+ if (pruned > 0) this.callGraph = null;
1712
+ return pruned;
1713
+ }
1714
+
1715
+ getFileCallData(file) {
1716
+ return this.fileCallData.get(file);
1717
+ }
1718
+
1719
+ hasFileCallData(file) {
1720
+ return this.fileCallData.has(file);
1721
+ }
1722
+
1723
+ getFileCallDataKeys() {
1724
+ return Array.from(this.fileCallData.keys());
1725
+ }
1726
+
1727
+ getFileCallDataCount() {
1728
+ return this.fileCallData.size;
1729
+ }
1730
+
1731
+ /**
1732
+ * Sets call data for a specific file.
1733
+ * @param {string} file
1734
+ * @param {object} data
1735
+ */
1736
+ setFileCallData(file, data) {
1737
+ this.fileCallData.set(file, data);
1738
+ this.callGraph = null;
1739
+ }
1740
+
1741
+ /**
1742
+ * Sets the entire file call data map.
1743
+ * @param {Map<string, object>|object} entries
1744
+ */
1745
+ setFileCallDataEntries(entries) {
1746
+ if (entries instanceof Map) {
1747
+ this.fileCallData = entries;
1748
+ } else {
1749
+ this.fileCallData.clear();
1750
+ if (entries && typeof entries === 'object') {
1751
+ for (const [file, data] of Object.entries(entries)) {
1752
+ this.fileCallData.set(file, data);
1753
+ }
1754
+ }
1755
+ }
1756
+ this.callGraph = null;
1757
+ }
1758
+
1759
+ clearFileCallData() {
1760
+ this.fileCallData.clear();
1761
+ this.callGraph = null;
1762
+ }
1763
+
1764
+ removeFileCallData(file) {
1765
+ this.fileCallData.delete(file);
1766
+ this.callGraph = null;
1767
+ }
1768
+
1769
  /**
   * Rebuild the call graph from the per-file call data.
   * Concurrent callers share a single in-flight build promise. Build
   * failures are logged and leave `callGraph` null (callers treat that as
   * "no graph available") rather than throwing.
   * @returns {Promise<void>}
   */
  async rebuildCallGraph() {
    if (this._callGraphBuild) return this._callGraphBuild;

    this._callGraphBuild = (async () => {
      try {
        // Lazy import keeps call-graph code out of the startup path.
        const { buildCallGraph } = await import('./call-graph.js');
        this.callGraph = buildCallGraph(this.fileCallData);
        if (this.config.verbose && this.callGraph) {
          console.info(
            `[CallGraph] Built graph: ${this.callGraph.defines.size} definitions, ${this.callGraph.calledBy.size} call targets`
          );
        }
      } catch (err) {
        console.error(`[CallGraph] Failed to build: ${err.message}`);
        this.callGraph = null;
      } finally {
        // Release the shared promise slot in all cases.
        this._callGraphBuild = null;
      }
    })();

    return this._callGraphBuild;
  }
1791
+
1792
+ async getRelatedFiles(symbols) {
1793
+ if (!this.config.callGraphEnabled || symbols.length === 0) return new Map();
1794
+ if (!this.callGraph && this.fileCallData.size > 0) await this.rebuildCallGraph();
1795
+ if (!this.callGraph) return new Map();
1796
+
1797
+ const { getRelatedFiles } = await import('./call-graph.js');
1798
+ return getRelatedFiles(this.callGraph, symbols, this.config.callGraphMaxHops);
1799
+ }
1800
+
1801
+ getCallGraphStats() {
1802
+ return {
1803
+ enabled: this.config.callGraphEnabled ?? false,
1804
+ filesWithData: this.fileCallData.size,
1805
+ graphBuilt: this.callGraph !== null,
1806
+ definitions: this.callGraph?.defines.size ?? 0,
1807
+ callTargets: this.callGraph?.calledBy.size ?? 0,
1808
+ };
1809
+ }
1810
+
1811
+ // -------------------- Abstraction Layer --------------------
1812
+
1813
+ /**
1814
+ * Returns the total number of chunks in the store.
1815
+ * @returns {number}
1816
+ */
1817
+ getStoreSize() {
1818
+ if (Array.isArray(this.vectorStore)) return this.vectorStore.length;
1819
+ if (this.binaryStore) return this.binaryStore.length;
1820
+ if (this.sqliteStore) return this.sqliteStore.length();
1821
+ return 0;
1822
+ }
1823
+
1824
+ /**
1825
+ * Retrieves a vector by its store index.
1826
+ * @param {number} index
1827
+ * @returns {Float32Array|null}
1828
+ */
1829
+ getVector(index) {
1830
+ return this.getChunkVector(index);
1831
+ }
1832
+
1833
  /**
   * Retrieves a chunk object by its store index.
   * Prefers the in-memory array; otherwise materializes a chunk-shaped
   * object from the binary or SQLite backend record (file, line range,
   * vector, and backend index markers).
   * @param {number} index
   * @returns {object|null} Chunk, or null when the index is out of range
   */
  getChunk(index) {
    if (Array.isArray(this.vectorStore) && index >= 0 && index < this.vectorStore.length) {
      return this.vectorStore[index];
    }
    if (this.binaryStore) {
      const record = this.binaryStore.getRecord(index);
      if (record) {
        return {
          file: record.file,
          startLine: record.startLine,
          endLine: record.endLine,
          vector: this.binaryStore.getVector(index),
          _index: index,
          _binaryIndex: index,
        };
      }
    }
    if (this.sqliteStore) {
      const record = this.sqliteStore.getRecord(index);
      if (record) {
        return {
          file: record.file,
          startLine: record.startLine,
          endLine: record.endLine,
          vector: this.sqliteStore.getVector(index),
          _index: index,
          _sqliteIndex: index,
        };
      }
    }
    return null;
  }
1870
+ }